[llvm] [profcheck] Add unknown branch weights to expanded cmpxchg loop. (PR #165841)
Jin Huang via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 31 01:53:25 PDT 2025
https://github.com/jinhuang1102 updated https://github.com/llvm/llvm-project/pull/165841
>From 85eae1e7e1da2676b318c81e23daa99214d251d7 Mon Sep 17 00:00:00 2001
From: Jin Huang <jingold at google.com>
Date: Fri, 31 Oct 2025 08:41:49 +0000
Subject: [PATCH] [profcheck] Add unknown branch weights to expanded cmpxchg
loop.
---
llvm/lib/CodeGen/AtomicExpandPass.cpp | 11 +-
.../AtomicExpand/AArch64/atomicrmw-fp.ll | 32 +-
.../AtomicExpand/AArch64/pcsections.ll | 2736 +++++------
.../AMDGPU/expand-atomic-f32-agent.ll | 1678 +++----
.../AMDGPU/expand-atomic-f32-system.ll | 1466 +++---
.../AMDGPU/expand-atomic-f64-agent.ll | 1240 ++---
.../AMDGPU/expand-atomic-f64-system.ll | 1078 ++---
.../AMDGPU/expand-atomic-rmw-nand.ll | 6 +-
.../expand-atomic-simplify-cfg-CAS-block.ll | 6 +-
.../AMDGPU/expand-atomic-v2bf16-agent.ll | 4196 ++++++++++++++---
.../AMDGPU/expand-atomic-v2bf16-system.ll | 612 +--
.../AMDGPU/expand-atomic-v2f16-agent.ll | 4194 +++++++++++++---
.../AMDGPU/expand-atomic-v2f16-system.ll | 596 +--
.../AMDGPU/expand-atomicrmw-fp-vector.ll | 102 +-
.../AtomicExpand/ARM/atomicrmw-fp.ll | 4 +-
.../AtomicExpand/LoongArch/atomicrmw-fp.ll | 16 +-
.../AtomicExpand/Mips/atomicrmw-fp.ll | 4 +-
.../AtomicExpand/PowerPC/atomicrmw-fp.ll | 155 +-
.../AtomicExpand/RISCV/atomicrmw-fp.ll | 4 +-
.../Transforms/AtomicExpand/SPARC/libcalls.ll | 54 +-
.../AtomicExpand/X86/expand-atomic-rmw-fp.ll | 10 +-
.../X86/expand-atomic-rmw-initial-load.ll | 6 +-
.../AtomicExpand/X86/expand-atomic-xchg-fp.ll | 10 +-
23 files changed, 12040 insertions(+), 6176 deletions(-)
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 53f1cfe24a68d..dffb69425bb31 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -38,6 +38,7 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
@@ -1259,8 +1260,7 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop(
BasicBlock *BB = Builder.GetInsertBlock();
Function *F = BB->getParent();
- assert(AddrAlign >=
- F->getDataLayout().getTypeStoreSize(ResultTy) &&
+ assert(AddrAlign >= F->getDataLayout().getTypeStoreSize(ResultTy) &&
"Expected at least natural alignment at this point.");
// Given: atomicrmw some_op iN* %addr, iN %incr ordering
@@ -1680,7 +1680,12 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
Loaded->addIncoming(NewLoaded, LoopBB);
- Builder.CreateCondBr(Success, ExitBB, LoopBB);
+ Instruction *CondBr = Builder.CreateCondBr(Success, ExitBB, LoopBB);
+
+ // Atomic RMW expands to a cmpxchg loop. Since precise branch weights
+ // cannot be easily determined here, we mark the branch as "unknown" (50/50)
+ // to prevent misleading optimizations.
+ setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, *F, DEBUG_TYPE);
Builder.SetInsertPoint(ExitBB, ExitBB->begin());
return NewLoaded;
diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
index 8ffacb9bdd5f6..fe42a5439857c 100644
--- a/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
@@ -14,7 +14,7 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[TMP5]]
;
@@ -35,7 +35,7 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[TMP5]]
;
@@ -56,7 +56,7 @@ define float @atomicrmw_fmin_float(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[TMP6]]
;
@@ -77,7 +77,7 @@ define float @atomicrmw_fmax_float(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[TMP6]]
;
@@ -98,7 +98,7 @@ define double @atomicrmw_fmin_double(ptr %ptr, double %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret double [[TMP6]]
;
@@ -119,7 +119,7 @@ define double @atomicrmw_fmax_double(ptr %ptr, double %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret double [[TMP6]]
;
@@ -140,7 +140,7 @@ define float @atomicrmw_fminimum_float(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[TMP6]]
;
@@ -161,7 +161,7 @@ define float @atomicrmw_fmaximum_float(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[TMP6]]
;
@@ -182,7 +182,7 @@ define double @atomicrmw_fminimum_double(ptr %ptr, double %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret double [[TMP6]]
;
@@ -203,7 +203,7 @@ define double @atomicrmw_fmaximum_double(ptr %ptr, double %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret double [[TMP6]]
;
@@ -224,7 +224,7 @@ define bfloat @atomicrmw_fmaximum_bfloat(ptr %ptr, bfloat %val) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i16, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i16 [[NEWLOADED]] to bfloat
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret bfloat [[TMP6]]
;
@@ -245,7 +245,7 @@ define half @atomicrmw_fmaximum_half(ptr %ptr, half %val) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i16, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i16 [[NEWLOADED]] to half
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret half [[TMP6]]
;
@@ -266,7 +266,7 @@ define <2 x half> @atomicrmw_fmaximum_2_x_half(ptr %ptr, <2 x half> %val) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <2 x half> [[TMP6]]
;
@@ -287,7 +287,7 @@ define bfloat @atomicrmw_fminimum_bfloat(ptr %ptr, bfloat %val) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i16, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i16 [[NEWLOADED]] to bfloat
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret bfloat [[TMP6]]
;
@@ -308,7 +308,7 @@ define half @atomicrmw_fminimum_half(ptr %ptr, half %val) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i16, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i16 [[NEWLOADED]] to half
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret half [[TMP6]]
;
@@ -329,7 +329,7 @@ define <2 x half> @atomicrmw_fminimum_2_x_half(ptr %ptr, <2 x half> %val) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <2 x half> [[TMP6]]
;
diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/pcsections.ll b/llvm/test/Transforms/AtomicExpand/AArch64/pcsections.ll
index c5c890559152d..5c60f21c207b9 100644
--- a/llvm/test/Transforms/AtomicExpand/AArch64/pcsections.ll
+++ b/llvm/test/Transforms/AtomicExpand/AArch64/pcsections.ll
@@ -4,7 +4,7 @@
define i8 @atomic8_load_unordered(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_load_unordered(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] unordered, align 1, !pcsections [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] unordered, align 1, !pcsections [[META1:![0-9]+]]
; CHECK-NEXT: ret i8 [[TMP0]]
;
entry:
@@ -15,7 +15,7 @@ entry:
define i8 @atomic8_load_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_load_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] monotonic, align 1, !pcsections [[META1]]
; CHECK-NEXT: ret i8 [[TMP0]]
;
entry:
@@ -26,7 +26,7 @@ entry:
define i8 @atomic8_load_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_load_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] acquire, align 1, !pcsections [[META1]]
; CHECK-NEXT: ret i8 [[TMP0]]
;
entry:
@@ -37,7 +37,7 @@ entry:
define i8 @atomic8_load_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_load_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] seq_cst, align 1, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] seq_cst, align 1, !pcsections [[META1]]
; CHECK-NEXT: ret i8 [[TMP0]]
;
entry:
@@ -48,7 +48,7 @@ entry:
define void @atomic8_store_unordered(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_store_unordered(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i8 0, ptr [[A:%.*]] unordered, align 1, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i8 0, ptr [[A:%.*]] unordered, align 1, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -59,7 +59,7 @@ entry:
define void @atomic8_store_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_store_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i8 0, ptr [[A:%.*]] monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i8 0, ptr [[A:%.*]] monotonic, align 1, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -70,7 +70,7 @@ entry:
define void @atomic8_store_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_store_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i8 0, ptr [[A:%.*]] release, align 1, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i8 0, ptr [[A:%.*]] release, align 1, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -81,7 +81,7 @@ entry:
define void @atomic8_store_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_store_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i8 0, ptr [[A:%.*]] seq_cst, align 1, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i8 0, ptr [[A:%.*]] seq_cst, align 1, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -92,14 +92,14 @@ entry:
define void @atomic8_xchg_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_xchg_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 monotonic monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 monotonic monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -111,14 +111,14 @@ entry:
define void @atomic8_add_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_add_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -130,14 +130,14 @@ entry:
define void @atomic8_sub_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_sub_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -149,14 +149,14 @@ entry:
define void @atomic8_and_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_and_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 monotonic monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 monotonic monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -168,14 +168,14 @@ entry:
define void @atomic8_or_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_or_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -187,14 +187,14 @@ entry:
define void @atomic8_xor_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_xor_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -206,14 +206,14 @@ entry:
define void @atomic8_nand_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_nand_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 monotonic monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 monotonic monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -225,14 +225,14 @@ entry:
define void @atomic8_xchg_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_xchg_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acquire acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acquire acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -244,14 +244,14 @@ entry:
define void @atomic8_add_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_add_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -263,14 +263,14 @@ entry:
define void @atomic8_sub_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_sub_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -282,14 +282,14 @@ entry:
define void @atomic8_and_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_and_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acquire acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acquire acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -301,14 +301,14 @@ entry:
define void @atomic8_or_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_or_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -320,14 +320,14 @@ entry:
define void @atomic8_xor_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_xor_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -339,14 +339,14 @@ entry:
define void @atomic8_nand_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_nand_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 acquire acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 acquire acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -358,14 +358,14 @@ entry:
define void @atomic8_xchg_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_xchg_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 release monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 release monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -377,14 +377,14 @@ entry:
define void @atomic8_add_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_add_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -396,14 +396,14 @@ entry:
define void @atomic8_sub_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_sub_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -415,14 +415,14 @@ entry:
define void @atomic8_and_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_and_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 release monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 release monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -434,14 +434,14 @@ entry:
define void @atomic8_or_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_or_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -453,14 +453,14 @@ entry:
define void @atomic8_xor_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_xor_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -472,14 +472,14 @@ entry:
define void @atomic8_nand_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_nand_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 release monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 release monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -491,14 +491,14 @@ entry:
define void @atomic8_xchg_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_xchg_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acq_rel acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acq_rel acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -510,14 +510,14 @@ entry:
define void @atomic8_add_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_add_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -529,14 +529,14 @@ entry:
define void @atomic8_sub_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_sub_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -548,14 +548,14 @@ entry:
define void @atomic8_and_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_and_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acq_rel acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acq_rel acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -567,14 +567,14 @@ entry:
define void @atomic8_or_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_or_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -586,14 +586,14 @@ entry:
define void @atomic8_xor_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_xor_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -605,14 +605,14 @@ entry:
define void @atomic8_nand_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_nand_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 acq_rel acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 acq_rel acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -624,14 +624,14 @@ entry:
define void @atomic8_xchg_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_xchg_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 seq_cst seq_cst, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 seq_cst seq_cst, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -643,14 +643,14 @@ entry:
define void @atomic8_add_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_add_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -662,14 +662,14 @@ entry:
define void @atomic8_sub_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_sub_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -681,14 +681,14 @@ entry:
define void @atomic8_and_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_and_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 seq_cst seq_cst, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 seq_cst seq_cst, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -700,14 +700,14 @@ entry:
define void @atomic8_or_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_or_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -719,14 +719,14 @@ entry:
define void @atomic8_xor_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_xor_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -738,14 +738,14 @@ entry:
define void @atomic8_nand_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_nand_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 seq_cst seq_cst, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 seq_cst seq_cst, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -757,9 +757,9 @@ entry:
define void @atomic8_cas_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_cas_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 monotonic monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 monotonic acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 monotonic seq_cst, align 1, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 monotonic monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 monotonic acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 monotonic seq_cst, align 1, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -772,9 +772,9 @@ entry:
define void @atomic8_cas_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_cas_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 acquire monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acquire acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acquire seq_cst, align 1, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 acquire monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acquire acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acquire seq_cst, align 1, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -787,9 +787,9 @@ entry:
define void @atomic8_cas_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_cas_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 release monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 release acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 release seq_cst, align 1, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 release monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 release acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 release seq_cst, align 1, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -802,9 +802,9 @@ entry:
define void @atomic8_cas_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_cas_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 acq_rel monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acq_rel acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acq_rel seq_cst, align 1, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 acq_rel monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acq_rel acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acq_rel seq_cst, align 1, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -817,9 +817,9 @@ entry:
define void @atomic8_cas_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic8_cas_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 seq_cst monotonic, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 seq_cst acquire, align 1, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 seq_cst seq_cst, align 1, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 seq_cst monotonic, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 seq_cst acquire, align 1, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 seq_cst seq_cst, align 1, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -832,7 +832,7 @@ entry:
define i16 @atomic16_load_unordered(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_load_unordered(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] unordered, align 2, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] unordered, align 2, !pcsections [[META1]]
; CHECK-NEXT: ret i16 [[TMP0]]
;
entry:
@@ -843,7 +843,7 @@ entry:
define i16 @atomic16_load_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_load_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] monotonic, align 2, !pcsections [[META1]]
; CHECK-NEXT: ret i16 [[TMP0]]
;
entry:
@@ -854,7 +854,7 @@ entry:
define i16 @atomic16_load_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_load_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] acquire, align 2, !pcsections [[META1]]
; CHECK-NEXT: ret i16 [[TMP0]]
;
entry:
@@ -865,7 +865,7 @@ entry:
define i16 @atomic16_load_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_load_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] seq_cst, align 2, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] seq_cst, align 2, !pcsections [[META1]]
; CHECK-NEXT: ret i16 [[TMP0]]
;
entry:
@@ -876,7 +876,7 @@ entry:
define void @atomic16_store_unordered(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_store_unordered(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i16 0, ptr [[A:%.*]] unordered, align 2, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i16 0, ptr [[A:%.*]] unordered, align 2, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -887,7 +887,7 @@ entry:
define void @atomic16_store_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_store_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i16 0, ptr [[A:%.*]] monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i16 0, ptr [[A:%.*]] monotonic, align 2, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -898,7 +898,7 @@ entry:
define void @atomic16_store_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_store_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i16 0, ptr [[A:%.*]] release, align 2, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i16 0, ptr [[A:%.*]] release, align 2, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -909,7 +909,7 @@ entry:
define void @atomic16_store_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_store_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i16 0, ptr [[A:%.*]] seq_cst, align 2, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i16 0, ptr [[A:%.*]] seq_cst, align 2, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -920,14 +920,14 @@ entry:
define void @atomic16_xchg_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_xchg_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 monotonic monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 monotonic monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -939,14 +939,14 @@ entry:
define void @atomic16_add_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_add_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -958,14 +958,14 @@ entry:
define void @atomic16_sub_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_sub_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -977,14 +977,14 @@ entry:
define void @atomic16_and_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_and_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 monotonic monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 monotonic monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -996,14 +996,14 @@ entry:
define void @atomic16_or_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_or_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1015,14 +1015,14 @@ entry:
define void @atomic16_xor_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_xor_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1034,14 +1034,14 @@ entry:
define void @atomic16_nand_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_nand_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 monotonic monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 monotonic monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1053,14 +1053,14 @@ entry:
define void @atomic16_xchg_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_xchg_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acquire acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acquire acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1072,14 +1072,14 @@ entry:
define void @atomic16_add_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_add_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1091,14 +1091,14 @@ entry:
define void @atomic16_sub_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_sub_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1110,14 +1110,14 @@ entry:
define void @atomic16_and_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_and_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acquire acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acquire acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1129,14 +1129,14 @@ entry:
define void @atomic16_or_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_or_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1148,14 +1148,14 @@ entry:
define void @atomic16_xor_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_xor_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1167,14 +1167,14 @@ entry:
define void @atomic16_nand_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_nand_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 acquire acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 acquire acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1186,14 +1186,14 @@ entry:
define void @atomic16_xchg_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_xchg_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 release monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 release monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1205,14 +1205,14 @@ entry:
define void @atomic16_add_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_add_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1224,14 +1224,14 @@ entry:
define void @atomic16_sub_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_sub_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1243,14 +1243,14 @@ entry:
define void @atomic16_and_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_and_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 release monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 release monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1262,14 +1262,14 @@ entry:
define void @atomic16_or_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_or_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1281,14 +1281,14 @@ entry:
define void @atomic16_xor_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_xor_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1300,14 +1300,14 @@ entry:
define void @atomic16_nand_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_nand_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 release monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 release monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1319,14 +1319,14 @@ entry:
define void @atomic16_xchg_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_xchg_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acq_rel acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acq_rel acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1338,14 +1338,14 @@ entry:
define void @atomic16_add_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_add_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1357,14 +1357,14 @@ entry:
define void @atomic16_sub_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_sub_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1376,14 +1376,14 @@ entry:
define void @atomic16_and_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_and_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acq_rel acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acq_rel acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1395,14 +1395,14 @@ entry:
define void @atomic16_or_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_or_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1414,14 +1414,14 @@ entry:
define void @atomic16_xor_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_xor_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1433,14 +1433,14 @@ entry:
define void @atomic16_nand_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_nand_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 acq_rel acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 acq_rel acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1452,14 +1452,14 @@ entry:
define void @atomic16_xchg_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_xchg_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 seq_cst seq_cst, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 seq_cst seq_cst, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1471,14 +1471,14 @@ entry:
define void @atomic16_add_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_add_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1490,14 +1490,14 @@ entry:
define void @atomic16_sub_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_sub_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1509,14 +1509,14 @@ entry:
define void @atomic16_and_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_and_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 seq_cst seq_cst, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 seq_cst seq_cst, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1528,14 +1528,14 @@ entry:
define void @atomic16_or_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_or_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1547,14 +1547,14 @@ entry:
define void @atomic16_xor_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_xor_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1566,14 +1566,14 @@ entry:
define void @atomic16_nand_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_nand_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 seq_cst seq_cst, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 seq_cst seq_cst, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1585,9 +1585,9 @@ entry:
define void @atomic16_cas_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_cas_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 monotonic monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 monotonic acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 monotonic seq_cst, align 2, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 monotonic monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 monotonic acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 monotonic seq_cst, align 2, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -1600,9 +1600,9 @@ entry:
define void @atomic16_cas_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_cas_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 acquire monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acquire acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acquire seq_cst, align 2, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 acquire monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acquire acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acquire seq_cst, align 2, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -1615,9 +1615,9 @@ entry:
define void @atomic16_cas_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_cas_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 release monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 release acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 release seq_cst, align 2, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 release monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 release acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 release seq_cst, align 2, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -1630,9 +1630,9 @@ entry:
define void @atomic16_cas_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_cas_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 acq_rel monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acq_rel acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acq_rel seq_cst, align 2, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 acq_rel monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acq_rel acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acq_rel seq_cst, align 2, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -1645,9 +1645,9 @@ entry:
define void @atomic16_cas_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic16_cas_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 seq_cst monotonic, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 seq_cst acquire, align 2, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 seq_cst seq_cst, align 2, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 seq_cst monotonic, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 seq_cst acquire, align 2, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 seq_cst seq_cst, align 2, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -1660,7 +1660,7 @@ entry:
define i32 @atomic32_load_unordered(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_load_unordered(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] unordered, align 4, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] unordered, align 4, !pcsections [[META1]]
; CHECK-NEXT: ret i32 [[TMP0]]
;
entry:
@@ -1671,7 +1671,7 @@ entry:
define i32 @atomic32_load_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_load_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] monotonic, align 4, !pcsections [[META1]]
; CHECK-NEXT: ret i32 [[TMP0]]
;
entry:
@@ -1682,7 +1682,7 @@ entry:
define i32 @atomic32_load_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_load_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] acquire, align 4, !pcsections [[META1]]
; CHECK-NEXT: ret i32 [[TMP0]]
;
entry:
@@ -1693,7 +1693,7 @@ entry:
define i32 @atomic32_load_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_load_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] seq_cst, align 4, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] seq_cst, align 4, !pcsections [[META1]]
; CHECK-NEXT: ret i32 [[TMP0]]
;
entry:
@@ -1704,7 +1704,7 @@ entry:
define void @atomic32_store_unordered(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_store_unordered(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i32 0, ptr [[A:%.*]] unordered, align 4, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i32 0, ptr [[A:%.*]] unordered, align 4, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -1715,7 +1715,7 @@ entry:
define void @atomic32_store_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_store_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i32 0, ptr [[A:%.*]] monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i32 0, ptr [[A:%.*]] monotonic, align 4, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -1726,7 +1726,7 @@ entry:
define void @atomic32_store_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_store_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i32 0, ptr [[A:%.*]] release, align 4, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i32 0, ptr [[A:%.*]] release, align 4, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -1737,7 +1737,7 @@ entry:
define void @atomic32_store_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_store_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i32 0, ptr [[A:%.*]] seq_cst, align 4, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i32 0, ptr [[A:%.*]] seq_cst, align 4, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -1748,14 +1748,14 @@ entry:
define void @atomic32_xchg_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_xchg_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 monotonic monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 monotonic monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1767,14 +1767,14 @@ entry:
define void @atomic32_add_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_add_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1786,14 +1786,14 @@ entry:
define void @atomic32_sub_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_sub_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1805,14 +1805,14 @@ entry:
define void @atomic32_and_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_and_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 monotonic monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 monotonic monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1824,14 +1824,14 @@ entry:
define void @atomic32_or_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_or_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1843,14 +1843,14 @@ entry:
define void @atomic32_xor_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_xor_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1862,14 +1862,14 @@ entry:
define void @atomic32_nand_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_nand_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 monotonic monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 monotonic monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1881,14 +1881,14 @@ entry:
define void @atomic32_xchg_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_xchg_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acquire acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acquire acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1900,14 +1900,14 @@ entry:
define void @atomic32_add_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_add_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1919,14 +1919,14 @@ entry:
define void @atomic32_sub_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_sub_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1938,14 +1938,14 @@ entry:
define void @atomic32_and_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_and_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acquire acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acquire acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1957,14 +1957,14 @@ entry:
define void @atomic32_or_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_or_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1976,14 +1976,14 @@ entry:
define void @atomic32_xor_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_xor_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -1995,14 +1995,14 @@ entry:
define void @atomic32_nand_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_nand_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 acquire acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 acquire acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2014,14 +2014,14 @@ entry:
define void @atomic32_xchg_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_xchg_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 release monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 release monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2033,14 +2033,14 @@ entry:
define void @atomic32_add_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_add_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2052,14 +2052,14 @@ entry:
define void @atomic32_sub_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_sub_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2071,14 +2071,14 @@ entry:
define void @atomic32_and_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_and_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 release monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 release monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2090,14 +2090,14 @@ entry:
define void @atomic32_or_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_or_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2109,14 +2109,14 @@ entry:
define void @atomic32_xor_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_xor_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2128,14 +2128,14 @@ entry:
define void @atomic32_nand_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_nand_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 release monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 release monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2147,14 +2147,14 @@ entry:
define void @atomic32_xchg_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_xchg_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acq_rel acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acq_rel acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2166,14 +2166,14 @@ entry:
define void @atomic32_add_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_add_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2185,14 +2185,14 @@ entry:
define void @atomic32_sub_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_sub_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2204,14 +2204,14 @@ entry:
define void @atomic32_and_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_and_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acq_rel acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acq_rel acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2223,14 +2223,14 @@ entry:
define void @atomic32_or_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_or_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2242,14 +2242,14 @@ entry:
define void @atomic32_xor_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_xor_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2261,14 +2261,14 @@ entry:
define void @atomic32_nand_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_nand_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 acq_rel acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 acq_rel acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2280,14 +2280,14 @@ entry:
define void @atomic32_xchg_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_xchg_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 seq_cst seq_cst, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 seq_cst seq_cst, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2299,14 +2299,14 @@ entry:
define void @atomic32_add_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_add_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2318,14 +2318,14 @@ entry:
define void @atomic32_sub_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_sub_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2337,14 +2337,14 @@ entry:
define void @atomic32_and_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_and_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 seq_cst seq_cst, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 seq_cst seq_cst, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2356,14 +2356,14 @@ entry:
define void @atomic32_or_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_or_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2375,14 +2375,14 @@ entry:
define void @atomic32_xor_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_xor_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2394,14 +2394,14 @@ entry:
define void @atomic32_nand_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_nand_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 seq_cst seq_cst, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 seq_cst seq_cst, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2413,9 +2413,9 @@ entry:
define void @atomic32_cas_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_cas_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 monotonic monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 monotonic acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 monotonic seq_cst, align 4, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 monotonic monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 monotonic acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 monotonic seq_cst, align 4, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -2428,9 +2428,9 @@ entry:
define void @atomic32_cas_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_cas_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 acquire monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acquire acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acquire seq_cst, align 4, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 acquire monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acquire acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acquire seq_cst, align 4, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -2443,9 +2443,9 @@ entry:
define void @atomic32_cas_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_cas_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 release monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 release acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 release seq_cst, align 4, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 release monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 release acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 release seq_cst, align 4, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -2458,9 +2458,9 @@ entry:
define void @atomic32_cas_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_cas_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 acq_rel monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acq_rel acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acq_rel seq_cst, align 4, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 acq_rel monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acq_rel acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acq_rel seq_cst, align 4, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -2473,9 +2473,9 @@ entry:
define void @atomic32_cas_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic32_cas_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 seq_cst monotonic, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 seq_cst acquire, align 4, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 seq_cst seq_cst, align 4, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 seq_cst monotonic, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 seq_cst acquire, align 4, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 seq_cst seq_cst, align 4, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -2488,7 +2488,7 @@ entry:
define i64 @atomic64_load_unordered(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_load_unordered(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] unordered, align 8, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] unordered, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret i64 [[TMP0]]
;
entry:
@@ -2499,7 +2499,7 @@ entry:
define i64 @atomic64_load_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_load_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] monotonic, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret i64 [[TMP0]]
;
entry:
@@ -2510,7 +2510,7 @@ entry:
define i64 @atomic64_load_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_load_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret i64 [[TMP0]]
;
entry:
@@ -2521,7 +2521,7 @@ entry:
define i64 @atomic64_load_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_load_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] seq_cst, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret i64 [[TMP0]]
;
entry:
@@ -2532,7 +2532,7 @@ entry:
define ptr @atomic64_load_seq_cst_ptr_ty(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_load_seq_cst_ptr_ty(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load atomic ptr, ptr [[A:%.*]] seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load atomic ptr, ptr [[A:%.*]] seq_cst, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret ptr [[TMP0]]
;
entry:
@@ -2543,7 +2543,7 @@ entry:
define void @atomic64_store_unordered(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_store_unordered(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i64 0, ptr [[A:%.*]] unordered, align 8, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i64 0, ptr [[A:%.*]] unordered, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -2554,7 +2554,7 @@ entry:
define void @atomic64_store_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_store_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i64 0, ptr [[A:%.*]] monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i64 0, ptr [[A:%.*]] monotonic, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -2565,7 +2565,7 @@ entry:
define void @atomic64_store_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_store_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i64 0, ptr [[A:%.*]] release, align 8, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i64 0, ptr [[A:%.*]] release, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -2576,7 +2576,7 @@ entry:
define void @atomic64_store_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_store_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic i64 0, ptr [[A:%.*]] seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT: store atomic i64 0, ptr [[A:%.*]] seq_cst, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -2587,7 +2587,7 @@ entry:
define void @atomic64_store_seq_cst_ptr_ty(ptr %a, ptr %v) nounwind uwtable {
; CHECK-LABEL: @atomic64_store_seq_cst_ptr_ty(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store atomic ptr [[V:%.*]], ptr [[A:%.*]] seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT: store atomic ptr [[V:%.*]], ptr [[A:%.*]] seq_cst, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -2598,14 +2598,14 @@ entry:
define void @atomic64_xchg_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_xchg_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 monotonic monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 monotonic monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2617,14 +2617,14 @@ entry:
define void @atomic64_add_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_add_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2636,14 +2636,14 @@ entry:
define void @atomic64_sub_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_sub_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2655,14 +2655,14 @@ entry:
define void @atomic64_and_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_and_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 monotonic monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 monotonic monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2674,14 +2674,14 @@ entry:
define void @atomic64_or_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_or_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2693,14 +2693,14 @@ entry:
define void @atomic64_xor_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_xor_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2712,14 +2712,14 @@ entry:
define void @atomic64_nand_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_nand_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 monotonic monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 monotonic monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2731,14 +2731,14 @@ entry:
define void @atomic64_xchg_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_xchg_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acquire acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acquire acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2750,14 +2750,14 @@ entry:
define void @atomic64_add_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_add_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2769,14 +2769,14 @@ entry:
define void @atomic64_sub_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_sub_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2788,14 +2788,14 @@ entry:
define void @atomic64_and_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_and_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acquire acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acquire acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2807,14 +2807,14 @@ entry:
define void @atomic64_or_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_or_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2826,14 +2826,14 @@ entry:
define void @atomic64_xor_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_xor_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2845,14 +2845,14 @@ entry:
define void @atomic64_nand_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_nand_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 acquire acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 acquire acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2864,14 +2864,14 @@ entry:
define void @atomic64_xchg_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_xchg_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 release monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 release monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2883,14 +2883,14 @@ entry:
define void @atomic64_add_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_add_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2902,14 +2902,14 @@ entry:
define void @atomic64_sub_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_sub_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2921,14 +2921,14 @@ entry:
define void @atomic64_and_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_and_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 release monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 release monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2940,14 +2940,14 @@ entry:
define void @atomic64_or_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_or_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2959,14 +2959,14 @@ entry:
define void @atomic64_xor_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_xor_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2978,14 +2978,14 @@ entry:
define void @atomic64_nand_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_nand_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 release monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 release monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -2997,14 +2997,14 @@ entry:
define void @atomic64_xchg_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_xchg_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acq_rel acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acq_rel acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3016,14 +3016,14 @@ entry:
define void @atomic64_add_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_add_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3035,14 +3035,14 @@ entry:
define void @atomic64_sub_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_sub_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3054,14 +3054,14 @@ entry:
define void @atomic64_and_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_and_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acq_rel acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acq_rel acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3073,14 +3073,14 @@ entry:
define void @atomic64_or_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_or_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3092,14 +3092,14 @@ entry:
define void @atomic64_xor_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_xor_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3111,14 +3111,14 @@ entry:
define void @atomic64_nand_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_nand_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 acq_rel acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 acq_rel acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3130,14 +3130,14 @@ entry:
define void @atomic64_xchg_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_xchg_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 seq_cst seq_cst, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 seq_cst seq_cst, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3149,14 +3149,14 @@ entry:
define void @atomic64_add_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_add_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3168,14 +3168,14 @@ entry:
define void @atomic64_sub_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_sub_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3187,14 +3187,14 @@ entry:
define void @atomic64_and_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_and_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 seq_cst seq_cst, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 seq_cst seq_cst, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3206,14 +3206,14 @@ entry:
define void @atomic64_or_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_or_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3225,14 +3225,14 @@ entry:
define void @atomic64_xor_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_xor_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3244,14 +3244,14 @@ entry:
define void @atomic64_nand_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_nand_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 seq_cst seq_cst, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 seq_cst seq_cst, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3263,9 +3263,9 @@ entry:
define void @atomic64_cas_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_cas_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 monotonic monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 monotonic acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 monotonic seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 monotonic monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 monotonic acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 monotonic seq_cst, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -3278,9 +3278,9 @@ entry:
define void @atomic64_cas_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_cas_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 acquire monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acquire acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acquire seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 acquire monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acquire acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acquire seq_cst, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -3293,9 +3293,9 @@ entry:
define void @atomic64_cas_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_cas_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 release monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 release acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 release seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 release monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 release acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 release seq_cst, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -3308,9 +3308,9 @@ entry:
define void @atomic64_cas_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_cas_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 acq_rel monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acq_rel acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acq_rel seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 acq_rel monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acq_rel acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acq_rel seq_cst, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -3323,9 +3323,9 @@ entry:
define void @atomic64_cas_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic64_cas_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 seq_cst monotonic, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 seq_cst acquire, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 seq_cst seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 seq_cst monotonic, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 seq_cst acquire, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 seq_cst seq_cst, align 8, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -3338,14 +3338,14 @@ entry:
define void @atomic64_cas_seq_cst_ptr_ty(ptr %a, ptr %v1, ptr %v2) nounwind uwtable {
; CHECK-LABEL: @atomic64_cas_seq_cst_ptr_ty(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[V1:%.*]] to i64, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V2:%.*]] to i64, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A:%.*]], i64 [[TMP0]], i64 [[TMP1]] seq_cst seq_cst, align 8, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0, !pcsections [[META0]]
-; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[V1:%.*]] to i64, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[V2:%.*]] to i64, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[A:%.*]], i64 [[TMP0]], i64 [[TMP1]] seq_cst seq_cst, align 8, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0, !pcsections [[META1]]
+; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -3356,8 +3356,8 @@ entry:
define i128 @atomic128_load_unordered(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_load_unordered(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 monotonic monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 monotonic monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections [[META1]]
; CHECK-NEXT: ret i128 [[LOADED]]
;
entry:
@@ -3368,8 +3368,8 @@ entry:
define i128 @atomic128_load_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_load_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 monotonic monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 monotonic monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections [[META1]]
; CHECK-NEXT: ret i128 [[LOADED]]
;
entry:
@@ -3380,8 +3380,8 @@ entry:
define i128 @atomic128_load_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_load_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 acquire acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 acquire acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections [[META1]]
; CHECK-NEXT: ret i128 [[LOADED]]
;
entry:
@@ -3392,8 +3392,8 @@ entry:
define i128 @atomic128_load_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_load_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 seq_cst seq_cst, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 seq_cst seq_cst, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections [[META1]]
; CHECK-NEXT: ret i128 [[LOADED]]
;
entry:
@@ -3404,14 +3404,14 @@ entry:
define void @atomic128_store_unordered(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_store_unordered(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3423,14 +3423,14 @@ entry:
define void @atomic128_store_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_store_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3442,14 +3442,14 @@ entry:
define void @atomic128_store_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_store_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 release monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 release monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3461,14 +3461,14 @@ entry:
define void @atomic128_store_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_store_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 seq_cst seq_cst, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 seq_cst seq_cst, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3480,14 +3480,14 @@ entry:
define void @atomic128_xchg_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_xchg_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3499,14 +3499,14 @@ entry:
define void @atomic128_add_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_add_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3518,14 +3518,14 @@ entry:
define void @atomic128_sub_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_sub_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3537,14 +3537,14 @@ entry:
define void @atomic128_and_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_and_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3556,14 +3556,14 @@ entry:
define void @atomic128_or_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_or_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3575,14 +3575,14 @@ entry:
define void @atomic128_xor_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_xor_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3594,14 +3594,14 @@ entry:
define void @atomic128_nand_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_nand_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 monotonic monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 monotonic monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3613,14 +3613,14 @@ entry:
define void @atomic128_xchg_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_xchg_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acquire acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acquire acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3632,14 +3632,14 @@ entry:
define void @atomic128_add_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_add_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3651,14 +3651,14 @@ entry:
define void @atomic128_sub_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_sub_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3670,14 +3670,14 @@ entry:
define void @atomic128_and_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_and_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acquire acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acquire acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3689,14 +3689,14 @@ entry:
define void @atomic128_or_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_or_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3708,14 +3708,14 @@ entry:
define void @atomic128_xor_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_xor_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3727,14 +3727,14 @@ entry:
define void @atomic128_nand_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_nand_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 acquire acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 acquire acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3746,14 +3746,14 @@ entry:
define void @atomic128_xchg_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_xchg_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 release monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 release monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3765,14 +3765,14 @@ entry:
define void @atomic128_add_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_add_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3784,14 +3784,14 @@ entry:
define void @atomic128_sub_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_sub_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3803,14 +3803,14 @@ entry:
define void @atomic128_and_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_and_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 release monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 release monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3822,14 +3822,14 @@ entry:
define void @atomic128_or_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_or_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3841,14 +3841,14 @@ entry:
define void @atomic128_xor_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_xor_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3860,14 +3860,14 @@ entry:
define void @atomic128_nand_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_nand_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 release monotonic, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 release monotonic, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3879,14 +3879,14 @@ entry:
define void @atomic128_xchg_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_xchg_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acq_rel acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acq_rel acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3898,14 +3898,14 @@ entry:
define void @atomic128_add_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_add_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3917,14 +3917,14 @@ entry:
define void @atomic128_sub_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_sub_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3936,14 +3936,14 @@ entry:
define void @atomic128_and_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_and_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acq_rel acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acq_rel acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3955,14 +3955,14 @@ entry:
define void @atomic128_or_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_or_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3974,14 +3974,14 @@ entry:
define void @atomic128_xor_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_xor_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -3993,14 +3993,14 @@ entry:
define void @atomic128_nand_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_nand_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 acq_rel acquire, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 acq_rel acquire, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -4012,14 +4012,14 @@ entry:
define void @atomic128_xchg_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_xchg_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 seq_cst seq_cst, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 seq_cst seq_cst, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -4031,14 +4031,14 @@ entry:
define void @atomic128_add_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_add_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -4050,14 +4050,14 @@ entry:
define void @atomic128_sub_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_sub_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -4069,14 +4069,14 @@ entry:
define void @atomic128_and_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_and_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 seq_cst seq_cst, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 seq_cst seq_cst, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -4088,14 +4088,14 @@ entry:
define void @atomic128_or_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_or_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -4107,14 +4107,14 @@ entry:
define void @atomic128_xor_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_xor_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -4126,14 +4126,14 @@ entry:
define void @atomic128_nand_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_nand_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
-; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META1]]
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]], !pcsections [[META1]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
-; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 seq_cst seq_cst, align 16, !pcsections [[META0]]
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
+; CHECK-NEXT: [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META1]]
+; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 seq_cst seq_cst, align 16, !pcsections [[META1]]
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META1]]
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META1]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]], !pcsections [[META1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret void
;
@@ -4145,7 +4145,7 @@ entry:
define void @atomic128_cas_monotonic(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_cas_monotonic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 monotonic monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 monotonic monotonic, align 16, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -4156,7 +4156,7 @@ entry:
define void @atomic128_cas_acquire(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_cas_acquire(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 acquire acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 acquire acquire, align 16, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -4167,7 +4167,7 @@ entry:
define void @atomic128_cas_release(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_cas_release(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 release monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 release monotonic, align 16, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -4178,7 +4178,7 @@ entry:
define void @atomic128_cas_acq_rel(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_cas_acq_rel(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 acq_rel acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 acq_rel acquire, align 16, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
@@ -4189,7 +4189,7 @@ entry:
define void @atomic128_cas_seq_cst(ptr %a) nounwind uwtable {
; CHECK-LABEL: @atomic128_cas_seq_cst(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 seq_cst seq_cst, align 16, !pcsections [[META0]]
+; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 seq_cst seq_cst, align 16, !pcsections [[META1]]
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
index bfc9be32ad5fa..10ddace544ee1 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
@@ -15,7 +15,7 @@
; xchg is supported over PCIe, so no expansion is necessary
define float @test_atomicrmw_xchg_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
; COMMON-NEXT: ret float [[RES]]
;
@@ -26,8 +26,8 @@ define float @test_atomicrmw_xchg_f32_global_agent(ptr addrspace(1) %ptr, float
; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1:![0-9]+]]
; COMMON-NEXT: ret float [[RES]]
;
%res = atomicrmw xchg ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -37,8 +37,8 @@ define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_fine_grained_memor
; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: ret float [[RES]]
;
%res = atomicrmw xchg ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
@@ -48,8 +48,8 @@ define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_remote_memory(ptr
; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: ret float [[RES]]
;
%res = atomicrmw xchg ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -62,7 +62,7 @@ define float @test_atomicrmw_xchg_f32_global_agent__amdgpu_no_fine_grained_memor
define float @test_atomicrmw_fadd_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -74,12 +74,12 @@ define float @test_atomicrmw_fadd_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -91,12 +91,12 @@ define float @test_atomicrmw_fadd_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -108,12 +108,12 @@ define float @test_atomicrmw_fadd_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -125,17 +125,17 @@ define float @test_atomicrmw_fadd_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -147,12 +147,12 @@ define float @test_atomicrmw_fadd_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -164,12 +164,12 @@ define float @test_atomicrmw_fadd_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
; GFX12-NEXT: ret float [[TMP5]]
;
@@ -179,7 +179,7 @@ define float @test_atomicrmw_fadd_f32_global_agent(ptr addrspace(1) %ptr, float
define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -187,16 +187,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -204,16 +204,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -221,16 +221,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -238,21 +238,21 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -260,22 +260,22 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -284,7 +284,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -292,16 +292,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -309,16 +309,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -326,16 +326,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -343,21 +343,21 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -365,16 +365,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -382,17 +382,17 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
@@ -401,7 +401,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -409,16 +409,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -426,16 +426,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -443,16 +443,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -460,21 +460,21 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -482,22 +482,22 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -506,7 +506,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -514,16 +514,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -531,16 +531,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -548,16 +548,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -565,21 +565,21 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -587,22 +587,22 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -611,7 +611,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -619,16 +619,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -636,16 +636,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -653,16 +653,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -670,21 +670,21 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -692,22 +692,22 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -716,7 +716,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -728,12 +728,12 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -745,12 +745,12 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -762,12 +762,12 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -779,17 +779,17 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -801,12 +801,12 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -818,13 +818,13 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
@@ -833,7 +833,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode(
define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -841,16 +841,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -858,16 +858,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -875,26 +875,26 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -902,22 +902,22 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -926,7 +926,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -934,16 +934,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -951,16 +951,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -968,16 +968,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -985,21 +985,21 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1007,16 +1007,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1024,17 +1024,17 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1043,7 +1043,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1051,16 +1051,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1068,16 +1068,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1085,26 +1085,26 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1112,22 +1112,22 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1136,7 +1136,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1144,16 +1144,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1161,16 +1161,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1178,26 +1178,26 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1205,22 +1205,22 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1229,7 +1229,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1237,16 +1237,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1254,16 +1254,16 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1271,26 +1271,26 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1298,22 +1298,22 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1326,7 +1326,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_ignore_denormal_mode_
define void @test_atomicrmw_fadd_noret_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1338,12 +1338,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent(ptr addrspace(1) %ptr, f
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1355,12 +1355,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent(ptr addrspace(1) %ptr, f
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1372,12 +1372,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent(ptr addrspace(1) %ptr, f
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1389,17 +1389,17 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent(ptr addrspace(1) %ptr, f
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1411,12 +1411,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent(ptr addrspace(1) %ptr, f
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1428,12 +1428,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent(ptr addrspace(1) %ptr, f
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
; GFX12-NEXT: ret void
;
@@ -1443,7 +1443,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent(ptr addrspace(1) %ptr, f
define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1451,16 +1451,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1468,16 +1468,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1485,16 +1485,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1502,21 +1502,21 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1524,22 +1524,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1548,7 +1548,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1556,16 +1556,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1573,16 +1573,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1590,16 +1590,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1607,21 +1607,21 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1629,16 +1629,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1646,17 +1646,17 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
@@ -1665,7 +1665,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_remote_memory
define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1673,16 +1673,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1690,16 +1690,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1707,16 +1707,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1724,21 +1724,21 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1746,22 +1746,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -1770,7 +1770,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1778,16 +1778,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1795,16 +1795,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1812,16 +1812,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1829,21 +1829,21 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1851,22 +1851,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -1875,7 +1875,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1883,16 +1883,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1900,16 +1900,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1917,16 +1917,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1934,21 +1934,21 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1956,22 +1956,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -1980,7 +1980,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_no_fine_grained_
define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1992,12 +1992,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2009,12 +2009,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2026,12 +2026,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2043,17 +2043,17 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2065,12 +2065,12 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2082,13 +2082,13 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
@@ -2097,7 +2097,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2105,16 +2105,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2122,31 +2122,31 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2154,22 +2154,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -2178,7 +2178,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2186,16 +2186,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2203,16 +2203,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2220,16 +2220,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2237,21 +2237,21 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2259,16 +2259,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2276,17 +2276,17 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -2295,7 +2295,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2303,16 +2303,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2320,31 +2320,31 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2352,22 +2352,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -2376,7 +2376,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2384,16 +2384,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2401,31 +2401,31 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2433,22 +2433,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -2457,7 +2457,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2465,16 +2465,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2482,31 +2482,31 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2514,22 +2514,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -2542,7 +2542,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_agent__amdgpu_ignore_denormal_
define float @test_atomicrmw_fsub_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2554,7 +2554,7 @@ define float @test_atomicrmw_fsub_f32_global_agent(ptr addrspace(1) %ptr, float
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[RES]]
;
@@ -2564,7 +2564,7 @@ define float @test_atomicrmw_fsub_f32_global_agent(ptr addrspace(1) %ptr, float
define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2572,11 +2572,11 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memor
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[RES]]
;
@@ -2586,7 +2586,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memor
define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2594,11 +2594,11 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[RES]]
;
@@ -2608,7 +2608,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_remote_memory(ptr
define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2616,11 +2616,11 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memor
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[RES]]
;
@@ -2630,7 +2630,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_no_fine_grained_memor
define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2642,7 +2642,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode(
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP5]]
;
@@ -2652,7 +2652,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode(
define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2660,11 +2660,11 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP5]]
;
@@ -2674,7 +2674,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2682,11 +2682,11 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP5]]
;
@@ -2696,7 +2696,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2704,11 +2704,11 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP5]]
;
@@ -2722,7 +2722,7 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2734,12 +2734,12 @@ define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2751,12 +2751,12 @@ define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2768,12 +2768,12 @@ define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2785,12 +2785,12 @@ define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -2802,12 +2802,12 @@ define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2819,12 +2819,12 @@ define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP6]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2836,12 +2836,12 @@ define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[TMP6]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
; GFX12-NEXT: ret float [[RES]]
;
@@ -2851,7 +2851,7 @@ define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float
define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2859,16 +2859,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2876,16 +2876,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2893,16 +2893,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2910,16 +2910,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -2927,27 +2927,27 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2956,7 +2956,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2964,16 +2964,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2981,16 +2981,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2998,16 +2998,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3015,16 +3015,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3032,16 +3032,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -3049,16 +3049,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -3066,17 +3066,17 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
@@ -3085,7 +3085,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr
define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3093,16 +3093,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3110,16 +3110,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3127,16 +3127,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3144,16 +3144,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3161,27 +3161,27 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -3190,7 +3190,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memor
define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3202,12 +3202,12 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3219,12 +3219,12 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3236,12 +3236,12 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3253,12 +3253,12 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3270,12 +3270,12 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -3287,12 +3287,12 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP6]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -3304,13 +3304,13 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[TMP6]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
@@ -3319,7 +3319,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3327,16 +3327,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3344,16 +3344,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3361,16 +3361,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3378,16 +3378,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3395,27 +3395,27 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -3424,7 +3424,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3432,16 +3432,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3449,16 +3449,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3466,16 +3466,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3483,16 +3483,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3500,16 +3500,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -3517,16 +3517,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -3534,17 +3534,17 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -3553,7 +3553,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3561,16 +3561,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3578,16 +3578,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3595,16 +3595,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3612,16 +3612,16 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3629,27 +3629,27 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -3662,7 +3662,7 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3674,12 +3674,12 @@ define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3691,12 +3691,12 @@ define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3708,12 +3708,12 @@ define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3725,12 +3725,12 @@ define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3742,12 +3742,12 @@ define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -3759,12 +3759,12 @@ define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP6]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -3776,12 +3776,12 @@ define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[TMP6]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
; GFX12-NEXT: ret float [[RES]]
;
@@ -3791,7 +3791,7 @@ define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float
define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3799,16 +3799,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3816,16 +3816,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3833,16 +3833,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3850,16 +3850,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3867,27 +3867,27 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -3896,7 +3896,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3904,16 +3904,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3921,16 +3921,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3938,16 +3938,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3955,16 +3955,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3972,16 +3972,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -3989,16 +3989,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -4006,17 +4006,17 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
@@ -4025,7 +4025,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr
define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -4033,16 +4033,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -4050,16 +4050,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -4067,16 +4067,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -4084,16 +4084,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -4101,27 +4101,27 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -4130,7 +4130,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memor
define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -4142,12 +4142,12 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -4159,12 +4159,12 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -4176,12 +4176,12 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -4193,12 +4193,12 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -4210,12 +4210,12 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -4227,12 +4227,12 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP6]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -4244,13 +4244,13 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[TMP6]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
@@ -4259,7 +4259,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -4267,16 +4267,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -4284,16 +4284,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -4301,16 +4301,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -4318,16 +4318,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -4335,27 +4335,27 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -4364,7 +4364,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -4372,16 +4372,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -4389,16 +4389,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -4406,16 +4406,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -4423,16 +4423,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -4440,16 +4440,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -4457,16 +4457,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -4474,17 +4474,17 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -4493,7 +4493,7 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -4501,16 +4501,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -4518,16 +4518,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -4535,16 +4535,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -4552,16 +4552,16 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -4569,27 +4569,27 @@ define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode_
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -4601,19 +4601,35 @@ attributes #1 = { "denormal-fp-mode-f32"="dynamic,dynamic" }
!0 = !{}
;.
-; GFX803: [[META0]] = !{}
+; GFX803: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX803: [[META1]] = !{}
+; GFX803: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX906: [[META0]] = !{}
+; GFX906: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX906: [[META1]] = !{}
+; GFX906: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX908: [[META0]] = !{}
+; GFX908: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX908: [[META1]] = !{}
+; GFX908: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX90A: [[META0]] = !{}
+; GFX90A: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX90A: [[META1]] = !{}
+; GFX90A: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX942: [[META0]] = !{}
+; GFX942: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX942: [[META1]] = !{}
+; GFX942: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX10: [[META0]] = !{}
+; GFX10: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX10: [[META1]] = !{}
+; GFX10: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX11: [[META0]] = !{}
+; GFX11: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX11: [[META1]] = !{}
+; GFX11: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX12: [[META0]] = !{}
+; GFX12: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX12: [[META1]] = !{}
+; GFX12: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
index dc751c255f263..28620bf93d2f1 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
@@ -15,7 +15,7 @@
; xchg is supported over PCIe, so no expansion is necessary
define float @test_atomicrmw_xchg_f32_global_system(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4
; COMMON-NEXT: ret float [[RES]]
;
@@ -26,8 +26,8 @@ define float @test_atomicrmw_xchg_f32_global_system(ptr addrspace(1) %ptr, float
; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1:![0-9]+]]
; COMMON-NEXT: ret float [[RES]]
;
%res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -37,8 +37,8 @@ define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memo
; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: ret float [[RES]]
;
%res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
@@ -48,8 +48,8 @@ define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_remote_memory(ptr
; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: ret float [[RES]]
;
%res = atomicrmw xchg ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -62,7 +62,7 @@ define float @test_atomicrmw_xchg_f32_global_system__amdgpu_no_fine_grained_memo
define float @test_atomicrmw_fadd_f32_global_system(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fadd_f32_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -74,7 +74,7 @@ define float @test_atomicrmw_fadd_f32_global_system(ptr addrspace(1) %ptr, float
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP5]]
;
@@ -84,7 +84,7 @@ define float @test_atomicrmw_fadd_f32_global_system(ptr addrspace(1) %ptr, float
define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -92,16 +92,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -109,16 +109,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -126,16 +126,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -143,21 +143,21 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -165,22 +165,22 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -189,7 +189,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -197,16 +197,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -214,16 +214,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -231,16 +231,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -248,21 +248,21 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -270,16 +270,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -287,17 +287,17 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
@@ -306,7 +306,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_remote_memory(ptr
define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -314,16 +314,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -331,16 +331,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -348,16 +348,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -365,21 +365,21 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -387,22 +387,22 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -411,7 +411,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -419,16 +419,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -436,16 +436,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -453,16 +453,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -470,21 +470,21 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -492,22 +492,22 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -516,7 +516,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -524,16 +524,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -541,16 +541,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -558,16 +558,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -575,21 +575,21 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -597,22 +597,22 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -621,7 +621,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_no_fine_grained_memo
define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -633,7 +633,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP5]]
;
@@ -643,7 +643,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -651,16 +651,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -668,16 +668,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -685,26 +685,26 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -712,22 +712,22 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -736,7 +736,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -744,16 +744,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -761,16 +761,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -778,16 +778,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -795,21 +795,21 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -817,16 +817,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -834,17 +834,17 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -853,7 +853,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -861,16 +861,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -878,16 +878,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -895,26 +895,26 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -922,22 +922,22 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -946,7 +946,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -954,16 +954,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -971,16 +971,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -988,26 +988,26 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1015,22 +1015,22 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1039,7 +1039,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
; GFX803-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1047,16 +1047,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP5]]
;
; GFX906-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1064,16 +1064,16 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP5]]
;
; GFX908-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1081,26 +1081,26 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP5]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX942-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret float [[RES]]
;
; GFX10-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1108,22 +1108,22 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[TMP5]]
;
; GFX11-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[TMP5]]
;
; GFX12-LABEL: define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[TMP5]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1136,7 +1136,7 @@ define float @test_atomicrmw_fadd_f32_global_system__amdgpu_ignore_denormal_mode
define void @test_atomicrmw_fadd_noret_f32_global_system(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1148,7 +1148,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system(ptr addrspace(1) %ptr,
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret void
;
@@ -1158,7 +1158,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system(ptr addrspace(1) %ptr,
define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1166,16 +1166,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1183,16 +1183,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1200,16 +1200,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1217,21 +1217,21 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1239,22 +1239,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1263,7 +1263,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1271,16 +1271,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1288,16 +1288,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1305,16 +1305,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1322,21 +1322,21 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1344,16 +1344,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1361,17 +1361,17 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
@@ -1380,7 +1380,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_remote_memor
define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1388,16 +1388,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1405,16 +1405,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1422,16 +1422,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1439,21 +1439,21 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1461,22 +1461,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -1485,7 +1485,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(ptr addrspace(1) %ptr, float %value) #0 {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1493,16 +1493,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1510,16 +1510,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1527,16 +1527,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1544,21 +1544,21 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1566,22 +1566,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -1590,7 +1590,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1598,16 +1598,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1615,16 +1615,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1632,16 +1632,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1649,21 +1649,21 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1671,22 +1671,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f32_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -1695,7 +1695,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_no_fine_grained
define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1707,7 +1707,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret void
;
@@ -1717,7 +1717,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1725,16 +1725,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1742,31 +1742,31 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1774,22 +1774,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1798,7 +1798,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1806,16 +1806,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1823,16 +1823,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1840,16 +1840,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1857,21 +1857,21 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1879,16 +1879,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1896,17 +1896,17 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1915,7 +1915,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1923,16 +1923,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1940,31 +1940,31 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1972,22 +1972,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1996,7 +1996,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, float %value) #0 {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2004,16 +2004,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2021,31 +2021,31 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2053,22 +2053,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -2077,7 +2077,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, float %value) #1 {
; GFX803-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2085,16 +2085,16 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret void
;
; GFX906-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2102,31 +2102,31 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret void
;
; GFX908-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret void
;
; GFX942-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret void
;
; GFX10-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2134,22 +2134,22 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret void
;
; GFX11-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret void
;
; GFX12-LABEL: define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret void
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -2162,7 +2162,7 @@ define void @test_atomicrmw_fadd_noret_f32_global_system__amdgpu_ignore_denormal
define float @test_atomicrmw_fsub_f32_global_system(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2174,7 +2174,7 @@ define float @test_atomicrmw_fsub_f32_global_system(ptr addrspace(1) %ptr, float
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[RES]]
;
@@ -2184,7 +2184,7 @@ define float @test_atomicrmw_fsub_f32_global_system(ptr addrspace(1) %ptr, float
define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2192,11 +2192,11 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[RES]]
;
@@ -2206,7 +2206,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memo
define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2214,11 +2214,11 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[RES]]
;
@@ -2228,7 +2228,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_remote_memory(ptr
define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2236,11 +2236,11 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[RES]]
;
@@ -2250,7 +2250,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_no_fine_grained_memo
define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2262,7 +2262,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP5]]
;
@@ -2272,7 +2272,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2280,11 +2280,11 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP5]]
;
@@ -2294,7 +2294,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2302,11 +2302,11 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP5]]
;
@@ -2316,7 +2316,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2324,11 +2324,11 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP5]]
;
@@ -2342,7 +2342,7 @@ define float @test_atomicrmw_fsub_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fmax_f32_global_system(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2354,7 +2354,7 @@ define float @test_atomicrmw_fmax_f32_global_system(ptr addrspace(1) %ptr, float
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP6]]
;
@@ -2364,7 +2364,7 @@ define float @test_atomicrmw_fmax_f32_global_system(ptr addrspace(1) %ptr, float
define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2372,16 +2372,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2389,16 +2389,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2406,16 +2406,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2423,16 +2423,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -2440,27 +2440,27 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2469,7 +2469,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2477,16 +2477,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2494,16 +2494,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2511,16 +2511,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2528,16 +2528,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -2545,16 +2545,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2562,16 +2562,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2579,17 +2579,17 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
@@ -2598,7 +2598,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr
define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2606,16 +2606,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2623,16 +2623,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2640,16 +2640,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2657,16 +2657,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -2674,27 +2674,27 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -2703,7 +2703,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memo
define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2715,7 +2715,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP6]]
;
@@ -2725,7 +2725,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2733,16 +2733,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2750,16 +2750,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2767,16 +2767,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2784,16 +2784,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -2801,27 +2801,27 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -2830,7 +2830,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2838,16 +2838,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2855,16 +2855,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2872,16 +2872,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2889,16 +2889,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -2906,16 +2906,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2923,16 +2923,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2940,17 +2940,17 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -2959,7 +2959,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2967,16 +2967,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2984,16 +2984,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3001,16 +3001,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3018,16 +3018,16 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3035,27 +3035,27 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -3068,7 +3068,7 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fmin_f32_global_system(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -3080,7 +3080,7 @@ define float @test_atomicrmw_fmin_f32_global_system(ptr addrspace(1) %ptr, float
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP6]]
;
@@ -3090,7 +3090,7 @@ define float @test_atomicrmw_fmin_f32_global_system(ptr addrspace(1) %ptr, float
define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3098,16 +3098,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3115,16 +3115,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3132,16 +3132,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3149,16 +3149,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3166,27 +3166,27 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -3195,7 +3195,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3203,16 +3203,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3220,16 +3220,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3237,16 +3237,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3254,16 +3254,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3271,16 +3271,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -3288,16 +3288,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -3305,17 +3305,17 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
@@ -3324,7 +3324,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr
define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3332,16 +3332,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3349,16 +3349,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3366,16 +3366,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3383,16 +3383,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3400,27 +3400,27 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -3429,7 +3429,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memo
define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -3441,7 +3441,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP6]]
;
@@ -3451,7 +3451,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3459,16 +3459,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3476,16 +3476,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3493,16 +3493,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3510,16 +3510,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3527,27 +3527,27 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -3556,7 +3556,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3564,16 +3564,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3581,16 +3581,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3598,16 +3598,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3615,16 +3615,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3632,16 +3632,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -3649,16 +3649,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -3666,17 +3666,17 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -3685,7 +3685,7 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3693,16 +3693,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret float [[TMP6]]
;
; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3710,16 +3710,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret float [[TMP6]]
;
; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3727,16 +3727,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret float [[TMP6]]
;
; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3744,16 +3744,16 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret float [[TMP6]]
;
; GFX942-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX942: atomicrmw.start:
@@ -3761,27 +3761,27 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
; GFX942-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
; GFX942-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
; GFX942-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX942: atomicrmw.end:
; GFX942-NEXT: ret float [[TMP6]]
;
; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret float [[RES]]
;
; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX11-NEXT: ret float [[RES]]
;
; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -3793,19 +3793,35 @@ attributes #1 = { "denormal-fp-mode-f32"="dynamic,dynamic" }
!0 = !{}
;.
-; GFX803: [[META0]] = !{}
+; GFX803: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX803: [[META1]] = !{}
+; GFX803: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX906: [[META0]] = !{}
+; GFX906: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX906: [[META1]] = !{}
+; GFX906: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX908: [[META0]] = !{}
+; GFX908: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX908: [[META1]] = !{}
+; GFX908: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX90A: [[META0]] = !{}
+; GFX90A: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX90A: [[META1]] = !{}
+; GFX90A: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX942: [[META0]] = !{}
+; GFX942: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX942: [[META1]] = !{}
+; GFX942: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX10: [[META0]] = !{}
+; GFX10: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX10: [[META1]] = !{}
+; GFX10: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX11: [[META0]] = !{}
+; GFX11: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX11: [[META1]] = !{}
+; GFX11: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX12: [[META0]] = !{}
+; GFX12: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX12: [[META1]] = !{}
+; GFX12: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
index 668e7ba9b1f0f..a7174c424abff 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
@@ -15,7 +15,7 @@
; xchg is supported over PCIe, so no expansion is necessary
define double @test_atomicrmw_xchg_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8
; COMMON-NEXT: ret double [[RES]]
;
@@ -26,8 +26,8 @@ define double @test_atomicrmw_xchg_f64_global_agent(ptr addrspace(1) %ptr, doubl
; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1:![0-9]+]]
; COMMON-NEXT: ret double [[RES]]
;
%res = atomicrmw xchg ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -37,8 +37,8 @@ define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_fine_grained_memo
; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: ret double [[RES]]
;
%res = atomicrmw xchg ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
@@ -48,8 +48,8 @@ define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_remote_memory(ptr
; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: ret double [[RES]]
;
%res = atomicrmw xchg ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -62,7 +62,7 @@ define double @test_atomicrmw_xchg_f64_global_agent__amdgpu_no_fine_grained_memo
define double @test_atomicrmw_fadd_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -74,12 +74,12 @@ define double @test_atomicrmw_fadd_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -91,12 +91,12 @@ define double @test_atomicrmw_fadd_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -108,12 +108,12 @@ define double @test_atomicrmw_fadd_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -125,17 +125,17 @@ define double @test_atomicrmw_fadd_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -147,12 +147,12 @@ define double @test_atomicrmw_fadd_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -164,12 +164,12 @@ define double @test_atomicrmw_fadd_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -181,7 +181,7 @@ define double @test_atomicrmw_fadd_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -191,7 +191,7 @@ define double @test_atomicrmw_fadd_f64_global_agent(ptr addrspace(1) %ptr, doubl
define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -199,16 +199,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -216,16 +216,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -233,26 +233,26 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -260,16 +260,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -277,16 +277,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -294,11 +294,11 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -308,7 +308,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -316,16 +316,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -333,16 +333,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -350,16 +350,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -367,21 +367,21 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -389,16 +389,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -406,16 +406,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -423,11 +423,11 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -437,7 +437,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_remote_memory(ptr
define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -445,16 +445,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -462,16 +462,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -479,26 +479,26 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -506,16 +506,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -523,16 +523,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -540,11 +540,11 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -554,7 +554,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(ptr addrspace(1) %ptr, double %value) #0 {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -562,16 +562,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -579,16 +579,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -596,26 +596,26 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -623,16 +623,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -640,16 +640,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -657,11 +657,11 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -671,7 +671,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(ptr addrspace(1) %ptr, double %value) #1 {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -679,16 +679,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -696,16 +696,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -713,26 +713,26 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -740,16 +740,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -757,16 +757,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -774,11 +774,11 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -788,7 +788,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_no_fine_grained_memo
define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -800,12 +800,12 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -817,12 +817,12 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -834,12 +834,12 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -851,17 +851,17 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -873,12 +873,12 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -890,12 +890,12 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -907,7 +907,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -917,7 +917,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -925,16 +925,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -942,16 +942,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -959,26 +959,26 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -986,16 +986,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1003,16 +1003,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -1020,11 +1020,11 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -1034,7 +1034,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1042,16 +1042,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1059,16 +1059,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1076,16 +1076,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1093,21 +1093,21 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1115,16 +1115,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1132,16 +1132,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -1149,11 +1149,11 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -1163,7 +1163,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1171,16 +1171,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1188,16 +1188,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1205,26 +1205,26 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1232,16 +1232,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1249,16 +1249,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -1266,11 +1266,11 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -1280,7 +1280,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, double %value) #0 {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1288,16 +1288,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1305,16 +1305,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1322,26 +1322,26 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1349,16 +1349,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1366,16 +1366,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -1383,11 +1383,11 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -1397,7 +1397,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, double %value) #1 {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1405,16 +1405,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1422,16 +1422,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1439,26 +1439,26 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1466,16 +1466,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1483,16 +1483,16 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -1500,11 +1500,11 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -1518,7 +1518,7 @@ define double @test_atomicrmw_fadd_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fsub_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1530,7 +1530,7 @@ define double @test_atomicrmw_fsub_f64_global_agent(ptr addrspace(1) %ptr, doubl
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[RES]]
;
@@ -1540,7 +1540,7 @@ define double @test_atomicrmw_fsub_f64_global_agent(ptr addrspace(1) %ptr, doubl
define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1548,11 +1548,11 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[RES]]
;
@@ -1562,7 +1562,7 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memo
define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1570,11 +1570,11 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_remote_memory(ptr
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[RES]]
;
@@ -1584,7 +1584,7 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_remote_memory(ptr
define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1592,11 +1592,11 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memo
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[RES]]
;
@@ -1606,7 +1606,7 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_no_fine_grained_memo
define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1618,7 +1618,7 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[TMP5]]
;
@@ -1628,7 +1628,7 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1636,11 +1636,11 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[TMP5]]
;
@@ -1650,7 +1650,7 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1658,11 +1658,11 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[TMP5]]
;
@@ -1672,7 +1672,7 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1680,11 +1680,11 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[TMP5]]
;
@@ -1698,7 +1698,7 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1710,12 +1710,12 @@ define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1727,12 +1727,12 @@ define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1744,12 +1744,12 @@ define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1761,17 +1761,17 @@ define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[TMP6]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1783,12 +1783,12 @@ define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP6]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1800,12 +1800,12 @@ define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -1817,7 +1817,7 @@ define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -1827,7 +1827,7 @@ define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, doubl
define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1835,16 +1835,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1852,16 +1852,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1869,31 +1869,31 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1901,16 +1901,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -1918,11 +1918,11 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -1932,7 +1932,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1940,16 +1940,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1957,16 +1957,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1974,16 +1974,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1991,21 +1991,21 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2013,16 +2013,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2030,16 +2030,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2047,11 +2047,11 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2061,7 +2061,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr
define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2069,16 +2069,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2086,16 +2086,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2103,31 +2103,31 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2135,16 +2135,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2152,11 +2152,11 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2166,7 +2166,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memo
define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2178,12 +2178,12 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2195,12 +2195,12 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2212,12 +2212,12 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2229,17 +2229,17 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[TMP6]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2251,12 +2251,12 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP6]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2268,12 +2268,12 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2285,7 +2285,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2295,7 +2295,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2303,16 +2303,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2320,16 +2320,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2337,31 +2337,31 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2369,16 +2369,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2386,11 +2386,11 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2400,7 +2400,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2408,16 +2408,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2425,16 +2425,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2442,16 +2442,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2459,21 +2459,21 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[TMP6]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2481,16 +2481,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP6]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2498,16 +2498,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2515,11 +2515,11 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2529,7 +2529,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2537,16 +2537,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2554,16 +2554,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2571,31 +2571,31 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2603,16 +2603,16 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2620,11 +2620,11 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2638,7 +2638,7 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2650,12 +2650,12 @@ define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2667,12 +2667,12 @@ define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2684,12 +2684,12 @@ define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2701,17 +2701,17 @@ define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[TMP6]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2723,12 +2723,12 @@ define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP6]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2740,12 +2740,12 @@ define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2757,7 +2757,7 @@ define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, doubl
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2767,7 +2767,7 @@ define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, doubl
define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2775,16 +2775,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2792,16 +2792,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2809,31 +2809,31 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2841,16 +2841,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2858,11 +2858,11 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2872,7 +2872,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2880,16 +2880,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2897,16 +2897,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2914,16 +2914,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2931,21 +2931,21 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2953,16 +2953,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2970,16 +2970,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2987,11 +2987,11 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -3001,7 +3001,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr
define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3009,16 +3009,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3026,16 +3026,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3043,31 +3043,31 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -3075,16 +3075,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -3092,11 +3092,11 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -3106,7 +3106,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memo
define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3118,12 +3118,12 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3135,12 +3135,12 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3152,12 +3152,12 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3169,17 +3169,17 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[TMP6]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -3191,12 +3191,12 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP6]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -3208,12 +3208,12 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -3225,7 +3225,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -3235,7 +3235,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3243,16 +3243,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3260,16 +3260,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3277,31 +3277,31 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -3309,16 +3309,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -3326,11 +3326,11 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -3340,7 +3340,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3348,16 +3348,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3365,16 +3365,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3382,16 +3382,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -3399,21 +3399,21 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[TMP6]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -3421,16 +3421,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP6]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -3438,16 +3438,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -3455,11 +3455,11 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -3469,7 +3469,7 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -3477,16 +3477,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -3494,16 +3494,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -3511,31 +3511,31 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -3543,16 +3543,16 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -3560,11 +3560,11 @@ define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_ignore_denormal_mode
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -3577,19 +3577,35 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
;.
-; GFX803: [[META0]] = !{}
+; GFX803: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX803: [[META1]] = !{}
+; GFX803: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX906: [[META0]] = !{}
+; GFX906: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX906: [[META1]] = !{}
+; GFX906: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX908: [[META0]] = !{}
+; GFX908: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX908: [[META1]] = !{}
+; GFX908: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX90A: [[META0]] = !{}
+; GFX90A: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX90A: [[META1]] = !{}
+; GFX90A: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX942: [[META0]] = !{}
+; GFX942: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX942: [[META1]] = !{}
+; GFX942: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX10: [[META0]] = !{}
+; GFX10: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX10: [[META1]] = !{}
+; GFX10: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX11: [[META0]] = !{}
+; GFX11: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX11: [[META1]] = !{}
+; GFX11: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX12: [[META0]] = !{}
+; GFX12: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX12: [[META1]] = !{}
+; GFX12: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
index f9c123f5b9427..7b20e747a8850 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
@@ -15,7 +15,7 @@
; xchg is supported over PCIe, so no expansion is necessary
define double @test_atomicrmw_xchg_f64_global_system(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8
; COMMON-NEXT: ret double [[RES]]
;
@@ -26,8 +26,8 @@ define double @test_atomicrmw_xchg_f64_global_system(ptr addrspace(1) %ptr, doub
; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1:![0-9]+]]
; COMMON-NEXT: ret double [[RES]]
;
%res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -37,8 +37,8 @@ define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_mem
; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: ret double [[RES]]
;
%res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
@@ -48,8 +48,8 @@ define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_remote_memory(pt
; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored.
define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; COMMON-NEXT: [[RES:%.*]] = atomicrmw xchg ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: ret double [[RES]]
;
%res = atomicrmw xchg ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -62,7 +62,7 @@ define double @test_atomicrmw_xchg_f64_global_system__amdgpu_no_fine_grained_mem
define double @test_atomicrmw_fadd_f64_global_system(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -74,7 +74,7 @@ define double @test_atomicrmw_fadd_f64_global_system(ptr addrspace(1) %ptr, doub
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[TMP5]]
;
@@ -84,7 +84,7 @@ define double @test_atomicrmw_fadd_f64_global_system(ptr addrspace(1) %ptr, doub
define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -92,16 +92,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -109,16 +109,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -126,26 +126,26 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -153,16 +153,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -170,16 +170,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -187,11 +187,11 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -201,7 +201,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -209,16 +209,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -226,16 +226,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -243,16 +243,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -260,21 +260,21 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -282,16 +282,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -299,16 +299,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -316,11 +316,11 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -330,7 +330,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_remote_memory(pt
define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -338,16 +338,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -355,16 +355,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -372,26 +372,26 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -399,16 +399,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -416,16 +416,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -433,11 +433,11 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -447,7 +447,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(ptr addrspace(1) %ptr, double %value) #0 {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -455,16 +455,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -472,16 +472,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -489,26 +489,26 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -516,16 +516,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -533,16 +533,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -550,11 +550,11 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -564,7 +564,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(ptr addrspace(1) %ptr, double %value) #1 {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -572,16 +572,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -589,16 +589,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -606,26 +606,26 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -633,16 +633,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -650,16 +650,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_f64_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -667,11 +667,11 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -681,7 +681,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_no_fine_grained_mem
define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -693,7 +693,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED1:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[NEWLOADED]] = bitcast i64 [[NEWLOADED1]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[NEWLOADED]]
;
@@ -703,7 +703,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -711,16 +711,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -728,16 +728,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -745,26 +745,26 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -772,16 +772,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -789,16 +789,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -806,11 +806,11 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -820,7 +820,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -828,16 +828,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -845,16 +845,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -862,16 +862,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -879,21 +879,21 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -901,16 +901,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -918,16 +918,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -935,11 +935,11 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -949,7 +949,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -957,16 +957,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -974,16 +974,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -991,26 +991,26 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1018,16 +1018,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1035,16 +1035,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -1052,11 +1052,11 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -1066,7 +1066,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, double %value) #0 {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1074,16 +1074,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1091,16 +1091,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1108,26 +1108,26 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1135,16 +1135,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1152,16 +1152,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -1169,11 +1169,11 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -1183,7 +1183,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, double %value) #1 {
; GFX803-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1191,16 +1191,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX803-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP5]]
;
; GFX906-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1208,16 +1208,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX906-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP5]]
;
; GFX908-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1225,26 +1225,26 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP5]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1252,16 +1252,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX10-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP5]]
;
; GFX11-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1269,16 +1269,16 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP5]]
;
; GFX12-LABEL: define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -1286,11 +1286,11 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]]
; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP5]]
;
@@ -1304,7 +1304,7 @@ define double @test_atomicrmw_fadd_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fsub_f64_global_system(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1316,7 +1316,7 @@ define double @test_atomicrmw_fsub_f64_global_system(ptr addrspace(1) %ptr, doub
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[RES]]
;
@@ -1326,7 +1326,7 @@ define double @test_atomicrmw_fsub_f64_global_system(ptr addrspace(1) %ptr, doub
define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1334,11 +1334,11 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_mem
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[RES]]
;
@@ -1348,7 +1348,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_mem
define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1356,11 +1356,11 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory(pt
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[RES]]
;
@@ -1370,7 +1370,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_remote_memory(pt
define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1378,11 +1378,11 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_mem
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[RES]]
;
@@ -1392,7 +1392,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_no_fine_grained_mem
define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1404,7 +1404,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mod
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[TMP5]]
;
@@ -1414,7 +1414,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1422,11 +1422,11 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mod
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[TMP5]]
;
@@ -1436,7 +1436,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1444,11 +1444,11 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mod
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[TMP5]]
;
@@ -1458,7 +1458,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1466,11 +1466,11 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mod
; COMMON-NEXT: [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[TMP5]]
;
@@ -1484,7 +1484,7 @@ define double @test_atomicrmw_fsub_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fmax_f64_global_system(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1496,7 +1496,7 @@ define double @test_atomicrmw_fmax_f64_global_system(ptr addrspace(1) %ptr, doub
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[TMP6]]
;
@@ -1506,7 +1506,7 @@ define double @test_atomicrmw_fmax_f64_global_system(ptr addrspace(1) %ptr, doub
define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1514,16 +1514,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1531,16 +1531,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1548,31 +1548,31 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1580,16 +1580,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -1597,11 +1597,11 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -1611,7 +1611,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1619,16 +1619,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1636,16 +1636,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1653,16 +1653,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1670,21 +1670,21 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1692,16 +1692,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1709,16 +1709,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -1726,11 +1726,11 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -1740,7 +1740,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(pt
define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1748,16 +1748,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1765,16 +1765,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1782,31 +1782,31 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1814,16 +1814,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -1831,11 +1831,11 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -1845,7 +1845,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_mem
define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1857,7 +1857,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[TMP6]]
;
@@ -1867,7 +1867,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1875,16 +1875,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1892,16 +1892,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1909,31 +1909,31 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1941,16 +1941,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -1958,11 +1958,11 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -1972,7 +1972,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1980,16 +1980,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1997,16 +1997,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2014,16 +2014,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2031,21 +2031,21 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[TMP6]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2053,16 +2053,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP6]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2070,16 +2070,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2087,11 +2087,11 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2101,7 +2101,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2109,16 +2109,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2126,16 +2126,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2143,31 +2143,31 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2175,16 +2175,16 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2192,11 +2192,11 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2210,7 +2210,7 @@ define double @test_atomicrmw_fmax_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fmin_f64_global_system(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2222,7 +2222,7 @@ define double @test_atomicrmw_fmin_f64_global_system(ptr addrspace(1) %ptr, doub
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[TMP6]]
;
@@ -2232,7 +2232,7 @@ define double @test_atomicrmw_fmin_f64_global_system(ptr addrspace(1) %ptr, doub
define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2240,16 +2240,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2257,16 +2257,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2274,31 +2274,31 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2306,16 +2306,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2323,11 +2323,11 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2337,7 +2337,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2345,16 +2345,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2362,16 +2362,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2379,16 +2379,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2396,21 +2396,21 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2418,16 +2418,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2435,16 +2435,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2452,11 +2452,11 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2466,7 +2466,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(pt
define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2474,16 +2474,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2491,16 +2491,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2508,31 +2508,31 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2540,16 +2540,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2557,11 +2557,11 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2571,7 +2571,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_mem
define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, double %value) {
; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -2583,7 +2583,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret double [[TMP6]]
;
@@ -2593,7 +2593,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2601,16 +2601,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2618,16 +2618,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2635,31 +2635,31 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2667,16 +2667,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2684,11 +2684,11 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2698,7 +2698,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2706,16 +2706,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2723,16 +2723,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2740,16 +2740,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -2757,21 +2757,21 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX90A-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret double [[TMP6]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -2779,16 +2779,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX10-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret double [[TMP6]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2796,16 +2796,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2813,11 +2813,11 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2827,7 +2827,7 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -2835,16 +2835,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret double [[TMP6]]
;
; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -2852,16 +2852,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret double [[TMP6]]
;
; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -2869,31 +2869,31 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret double [[TMP6]]
;
; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX90A-NEXT: ret double [[RES]]
;
; GFX942-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret double [[RES]]
;
; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX10-NEXT: ret double [[RES]]
;
; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -2901,16 +2901,16 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret double [[TMP6]]
;
; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX12: atomicrmw.start:
@@ -2918,11 +2918,11 @@ define double @test_atomicrmw_fmin_f64_global_system__amdgpu_ignore_denormal_mod
; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
; GFX12: atomicrmw.end:
; GFX12-NEXT: ret double [[TMP6]]
;
@@ -2935,19 +2935,35 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
;.
-; GFX803: [[META0]] = !{}
+; GFX803: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX803: [[META1]] = !{}
+; GFX803: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX906: [[META0]] = !{}
+; GFX906: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX906: [[META1]] = !{}
+; GFX906: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX908: [[META0]] = !{}
+; GFX908: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX908: [[META1]] = !{}
+; GFX908: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX90A: [[META0]] = !{}
+; GFX90A: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX90A: [[META1]] = !{}
+; GFX90A: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX942: [[META0]] = !{}
+; GFX942: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX942: [[META1]] = !{}
+; GFX942: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX10: [[META0]] = !{}
+; GFX10: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX10: [[META1]] = !{}
+; GFX10: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX11: [[META0]] = !{}
+; GFX11: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX11: [[META1]] = !{}
+; GFX11: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX12: [[META0]] = !{}
+; GFX12: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX12: [[META1]] = !{}
+; GFX12: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-nand.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-nand.ll
index ac88ff1dd8807..22fe2d1fe1dde 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-nand.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-nand.ll
@@ -13,7 +13,7 @@ define i32 @test_atomicrmw_nand_i32_flat(ptr %ptr, i32 %value) {
; CHECK-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret i32 [[NEWLOADED]]
;
@@ -32,7 +32,7 @@ define i32 @test_atomicrmw_nand_i32_global(ptr addrspace(1) %ptr, i32 %value) {
; CHECK-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret i32 [[NEWLOADED]]
;
@@ -51,7 +51,7 @@ define i32 @test_atomicrmw_nand_i32_local(ptr addrspace(3) %ptr, i32 %value) {
; CHECK-NEXT: [[TMP3:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret i32 [[NEWLOADED]]
;
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-simplify-cfg-CAS-block.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-simplify-cfg-CAS-block.ll
index dbffa24b3cc7f..11226ebbe6adc 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-simplify-cfg-CAS-block.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-simplify-cfg-CAS-block.ll
@@ -8,7 +8,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, float %in) #0 {
; GFX90A-NEXT: entry:
; GFX90A-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX90A-NEXT: [[D_CMP:%.*]] = icmp ult i32 [[TID]], 16
-; GFX90A-NEXT: br i1 [[D_CMP]], label [[IF:%.*]], label [[ELSE:%.*]]
+; GFX90A-NEXT: br i1 [[D_CMP]], label [[IF:%.*]], label [[ELSE:%.*]], !prof [[PROF1:![0-9]+]]
; GFX90A: if:
; GFX90A-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(1) [[OUT:%.*]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
@@ -21,7 +21,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, float %in) #0 {
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
; GFX90A-NEXT: [[TMP4]] = bitcast i32 [[NEWLOADED]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: br label [[ENDIF:%.*]]
; GFX90A: else:
@@ -36,7 +36,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, float %in) #0 {
; GFX90A-NEXT: [[SUCCESS5:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
; GFX90A-NEXT: [[NEWLOADED6:%.*]] = extractvalue { i32, i1 } [[TMP8]], 0
; GFX90A-NEXT: [[TMP9]] = bitcast i32 [[NEWLOADED6]] to float
-; GFX90A-NEXT: br i1 [[SUCCESS5]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START2]]
+; GFX90A-NEXT: br i1 [[SUCCESS5]], label [[ATOMICRMW_END1:%.*]], label [[ATOMICRMW_START2]], !prof [[PROF2]]
; GFX90A: atomicrmw.end1:
; GFX90A-NEXT: br label [[ENDIF]]
; GFX90A: endif:
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
index fdb4c592a6130..d1c0f5dd713d9 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll
@@ -42,7 +42,7 @@
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -54,12 +54,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %p
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -71,12 +71,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %p
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -88,12 +88,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %p
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -105,17 +105,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %p
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -127,12 +127,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %p
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -144,12 +144,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %p
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
@@ -159,7 +159,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %p
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -167,16 +167,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -184,16 +184,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -201,16 +201,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -218,21 +218,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1:![0-9]+]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -240,16 +240,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -257,17 +257,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1:![0-9]+]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -276,7 +276,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -284,16 +284,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -301,16 +301,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -318,16 +318,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -335,21 +335,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -357,16 +357,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -374,17 +374,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
@@ -393,7 +393,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_m
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -401,16 +401,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -418,16 +418,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -435,16 +435,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -452,21 +452,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -474,16 +474,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -491,17 +491,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -510,7 +510,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -518,16 +518,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -535,16 +535,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -552,16 +552,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -569,21 +569,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -591,16 +591,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -608,17 +608,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -627,7 +627,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -635,16 +635,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -652,16 +652,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -669,16 +669,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -686,21 +686,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -708,16 +708,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -725,17 +725,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -744,7 +744,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_gra
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -756,12 +756,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -773,12 +773,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -790,12 +790,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -807,17 +807,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -829,12 +829,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -846,13 +846,13 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
@@ -861,7 +861,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -869,16 +869,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -886,16 +886,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -903,16 +903,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -920,21 +920,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -942,16 +942,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -959,17 +959,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -978,7 +978,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -986,16 +986,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1003,16 +1003,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1020,16 +1020,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1037,21 +1037,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1059,16 +1059,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1076,17 +1076,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1095,7 +1095,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1103,16 +1103,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1120,16 +1120,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1137,16 +1137,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1154,21 +1154,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1176,16 +1176,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1193,17 +1193,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1212,7 +1212,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1220,16 +1220,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1237,16 +1237,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1254,16 +1254,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1271,21 +1271,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1293,16 +1293,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1310,17 +1310,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1329,7 +1329,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1337,16 +1337,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1354,16 +1354,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1371,16 +1371,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1388,21 +1388,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1410,16 +1410,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1427,17 +1427,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1449,176 +1449,1128 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno
;---------------------------------------------------------------------
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[RES]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP5]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[RES]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP5]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[RES]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP5]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[RES]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
- %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
- ret <2 x bfloat> %res
-}
-
-define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
- %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
- ret <2 x bfloat> %res
-}
-
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+ ret <2 x bfloat> %res
+}
+
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP5]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
- %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
- ret <2 x bfloat> %res
-}
-
-define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP5]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
@@ -1629,176 +2581,1128 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent__amdgpu_ignore_deno
;---------------------------------------------------------------------
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[RES]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[RES]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[RES]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[RES]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+ %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret <2 x bfloat> %res
+}
+
+define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
;
- %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
- ret <2 x bfloat> %res
-}
-
-define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
@@ -1809,176 +3713,1128 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent__amdgpu_ignore_deno
;---------------------------------------------------------------------
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[RES]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[RES]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[RES]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[RES]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
}
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
-; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
+; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x bfloat> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x bfloat> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x bfloat> %res
@@ -1989,19 +4845,37 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
;.
-; GFX803: [[META0]] = !{}
+; GFX803: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX803: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX803: [[META2]] = !{}
;.
-; GFX906: [[META0]] = !{}
+; GFX906: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX906: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX906: [[META2]] = !{}
;.
-; GFX908: [[META0]] = !{}
+; GFX908: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX908: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX908: [[META2]] = !{}
;.
-; GFX90A: [[META0]] = !{}
+; GFX90A: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX90A: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX90A: [[META2]] = !{}
;.
-; GFX942: [[META0]] = !{}
+; GFX942: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX942: [[META1]] = !{}
+; GFX942: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX10: [[META0]] = !{}
+; GFX10: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX10: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX10: [[META2]] = !{}
;.
-; GFX11: [[META0]] = !{}
+; GFX11: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX11: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX11: [[META2]] = !{}
;.
-; GFX12: [[META0]] = !{}
+; GFX12: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX12: [[META1]] = !{}
+; GFX12: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; COMMON: {{.*}}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
index 2401418807788..72e4411b15207 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll
@@ -42,7 +42,7 @@
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -54,7 +54,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(ptr addrspace(1) %
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
;
@@ -64,7 +64,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(ptr addrspace(1) %
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -72,16 +72,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -89,16 +89,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -106,16 +106,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -123,21 +123,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -145,16 +145,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -162,17 +162,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -181,7 +181,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -189,16 +189,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -206,16 +206,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -223,16 +223,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -240,21 +240,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -262,16 +262,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -279,17 +279,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory !0
@@ -298,7 +298,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -306,16 +306,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -323,16 +323,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -340,16 +340,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -357,21 +357,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -379,16 +379,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -396,17 +396,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -415,7 +415,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -423,16 +423,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -440,16 +440,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -457,16 +457,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -474,21 +474,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -496,16 +496,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -513,17 +513,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -532,7 +532,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -540,16 +540,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -557,16 +557,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -574,16 +574,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -591,21 +591,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -613,16 +613,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -630,17 +630,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -649,7 +649,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_gr
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -661,7 +661,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
;
@@ -671,7 +671,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -679,16 +679,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -696,16 +696,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -713,16 +713,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -730,21 +730,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -752,16 +752,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -769,17 +769,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -788,7 +788,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -796,16 +796,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -813,16 +813,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -830,16 +830,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -847,21 +847,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -869,16 +869,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -886,17 +886,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -905,7 +905,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -913,16 +913,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -930,16 +930,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -947,16 +947,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -964,21 +964,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -986,16 +986,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1003,17 +1003,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1022,7 +1022,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1030,16 +1030,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1047,16 +1047,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1064,16 +1064,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1081,21 +1081,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1103,16 +1103,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1120,17 +1120,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1139,7 +1139,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 {
; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1147,16 +1147,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1164,16 +1164,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1181,16 +1181,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -1198,21 +1198,21 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1220,16 +1220,16 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1237,17 +1237,17 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX12-NEXT: ret <2 x bfloat> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1260,7 +1260,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1272,7 +1272,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system(ptr addrspace(1) %
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[RES]]
;
@@ -1282,7 +1282,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system(ptr addrspace(1) %
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1290,11 +1290,11 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_gr
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[RES]]
;
@@ -1304,7 +1304,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_gr
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1312,11 +1312,11 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[RES]]
;
@@ -1326,7 +1326,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_remote_
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1334,11 +1334,11 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_gr
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[RES]]
;
@@ -1348,7 +1348,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_no_fine_gr
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1360,7 +1360,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
;
@@ -1370,7 +1370,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1378,11 +1378,11 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
;
@@ -1392,7 +1392,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1400,11 +1400,11 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
;
@@ -1414,7 +1414,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1422,11 +1422,11 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x bfloat> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[TMP5]]
;
@@ -1440,7 +1440,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1452,7 +1452,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system(ptr addrspace(1) %
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[RES]]
;
@@ -1462,7 +1462,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system(ptr addrspace(1) %
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1470,11 +1470,11 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_gr
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[RES]]
;
@@ -1484,7 +1484,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_gr
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1492,11 +1492,11 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[RES]]
;
@@ -1506,7 +1506,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_remote_
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1514,11 +1514,11 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_gr
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[RES]]
;
@@ -1528,7 +1528,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_no_fine_gr
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1540,7 +1540,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
;
@@ -1550,7 +1550,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1558,11 +1558,11 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
;
@@ -1572,7 +1572,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1580,11 +1580,11 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
;
@@ -1594,7 +1594,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1602,11 +1602,11 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
;
@@ -1620,7 +1620,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1632,7 +1632,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system(ptr addrspace(1) %
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[RES]]
;
@@ -1642,7 +1642,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system(ptr addrspace(1) %
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1650,11 +1650,11 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_gr
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[RES]]
;
@@ -1664,7 +1664,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_gr
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1672,11 +1672,11 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[RES]]
;
@@ -1686,7 +1686,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_remote_
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1694,11 +1694,11 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_gr
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[RES]]
;
@@ -1708,7 +1708,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_no_fine_gr
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1720,7 +1720,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
;
@@ -1730,7 +1730,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1738,11 +1738,11 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
;
@@ -1752,7 +1752,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1760,11 +1760,11 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
;
@@ -1774,7 +1774,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1782,11 +1782,11 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_system__amdgpu_ignore_den
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> [[LOADED]], <2 x bfloat> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x bfloat> [[TMP6]]
;
@@ -1799,19 +1799,35 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
;.
-; GFX803: [[META0]] = !{}
+; GFX803: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX803: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX803: [[META2]] = !{}
;.
-; GFX906: [[META0]] = !{}
+; GFX906: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX906: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX906: [[META2]] = !{}
;.
-; GFX908: [[META0]] = !{}
+; GFX908: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX908: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX908: [[META2]] = !{}
;.
-; GFX90A: [[META0]] = !{}
+; GFX90A: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX90A: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX90A: [[META2]] = !{}
;.
-; GFX942: [[META0]] = !{}
+; GFX942: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX942: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX942: [[META2]] = !{}
;.
-; GFX10: [[META0]] = !{}
+; GFX10: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX10: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX10: [[META2]] = !{}
;.
-; GFX11: [[META0]] = !{}
+; GFX11: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX11: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX11: [[META2]] = !{}
;.
-; GFX12: [[META0]] = !{}
+; GFX12: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX12: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX12: [[META2]] = !{}
;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
index 59433db38443e..81fc8fcacdda5 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll
@@ -42,7 +42,7 @@
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -54,12 +54,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr,
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -71,12 +71,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr,
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -88,12 +88,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr,
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -105,17 +105,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr,
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -127,12 +127,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr,
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -144,12 +144,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr,
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4
; GFX12-NEXT: ret <2 x half> [[RES]]
;
@@ -159,7 +159,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr,
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -167,16 +167,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -184,16 +184,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -201,26 +201,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1:![0-9]+]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -228,16 +228,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -245,17 +245,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1:![0-9]+]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -264,7 +264,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -272,16 +272,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -289,16 +289,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -306,16 +306,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -323,21 +323,21 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -345,16 +345,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -362,17 +362,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
@@ -381,7 +381,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memo
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -389,16 +389,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -406,16 +406,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -423,26 +423,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -450,16 +450,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -467,17 +467,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -486,7 +486,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -494,16 +494,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -511,16 +511,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -528,26 +528,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -555,16 +555,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -572,17 +572,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -591,7 +591,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -599,16 +599,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -616,16 +616,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -633,26 +633,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -660,16 +660,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -677,17 +677,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -696,7 +696,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_graine
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -708,12 +708,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -725,12 +725,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -742,12 +742,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -759,17 +759,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -781,12 +781,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -798,13 +798,13 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
@@ -813,7 +813,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -821,16 +821,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -838,16 +838,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -855,26 +855,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -882,16 +882,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -899,17 +899,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -918,7 +918,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -926,16 +926,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -943,16 +943,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -960,16 +960,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -977,21 +977,21 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -999,16 +999,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1016,17 +1016,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1035,7 +1035,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1043,16 +1043,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1060,16 +1060,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1077,26 +1077,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1104,16 +1104,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1121,17 +1121,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1140,7 +1140,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1148,16 +1148,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1165,16 +1165,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1182,26 +1182,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1209,16 +1209,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1226,17 +1226,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1245,7 +1245,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1253,16 +1253,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1270,16 +1270,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1287,26 +1287,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1314,16 +1314,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1331,17 +1331,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]], !amdgpu.ignore.denormal.mode [[META1]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1353,176 +1353,1128 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma
;---------------------------------------------------------------------
define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[RES]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[RES]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[RES]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[RES]]
-;
- %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
- ret <2 x half> %res
-}
-
-define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
- %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
- ret <2 x half> %res
-}
-
-define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
- %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
- ret <2 x half> %res
-}
-
-define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
+ ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP5]]
+;
+ %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX942-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX942-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP5]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
- %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
- ret <2 x half> %res
-}
-
-define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
-; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP5]]
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
+; GFX12-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GFX12-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP5]]
;
%res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
@@ -1533,176 +2485,1128 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denorma
;---------------------------------------------------------------------
define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[RES]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[RES]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[RES]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[RES]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP6]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP6]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP6]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
+;
+ %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret <2 x half> %res
+}
+
+define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
;
- %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
- ret <2 x half> %res
-}
-
-define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP6]]
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
@@ -1713,176 +3617,1128 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denorma
;---------------------------------------------------------------------
define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[RES]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[RES]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[RES]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[RES]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP6]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP6]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP6]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
}
define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
-; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret <2 x half> [[TMP6]]
+; GFX803-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX906-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX908-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX942-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX942-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX942: atomicrmw.start:
+; GFX942-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX942-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX942-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX942-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX942-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX942-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX942-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX942-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX942-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX942: atomicrmw.end:
+; GFX942-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX10-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX11-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret <2 x half> [[TMP6]]
+;
+; GFX12-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META1]], !amdgpu.no.remote.memory [[META1]]
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret <2 x half> [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret <2 x half> %res
@@ -1893,19 +4749,37 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
;.
-; GFX803: [[META0]] = !{}
+; GFX803: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX803: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX803: [[META2]] = !{}
;.
-; GFX906: [[META0]] = !{}
+; GFX906: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX906: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX906: [[META2]] = !{}
;.
-; GFX908: [[META0]] = !{}
+; GFX908: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX908: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX908: [[META2]] = !{}
;.
-; GFX90A: [[META0]] = !{}
+; GFX90A: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX90A: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX90A: [[META2]] = !{}
;.
-; GFX942: [[META0]] = !{}
+; GFX942: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX942: [[META1]] = !{}
+; GFX942: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
-; GFX10: [[META0]] = !{}
+; GFX10: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX10: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX10: [[META2]] = !{}
;.
-; GFX11: [[META0]] = !{}
+; GFX11: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX11: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX11: [[META2]] = !{}
;.
-; GFX12: [[META0]] = !{}
+; GFX12: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX12: [[META1]] = !{}
+; GFX12: [[PROF2]] = !{!"unknown", !"atomic-expand"}
;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; COMMON: {{.*}}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
index 148261cd0a678..4a84e874e4dbe 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll
@@ -42,7 +42,7 @@
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -54,7 +54,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(ptr addrspace(1) %ptr
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
@@ -64,7 +64,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(ptr addrspace(1) %ptr
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -72,16 +72,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -89,16 +89,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -106,26 +106,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -133,16 +133,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -150,17 +150,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -169,7 +169,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -177,16 +177,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -194,16 +194,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -211,16 +211,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -228,21 +228,21 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -250,16 +250,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -267,17 +267,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory !0
@@ -286,7 +286,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_mem
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -294,16 +294,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -311,16 +311,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -328,26 +328,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -355,16 +355,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -372,17 +372,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -391,7 +391,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -399,16 +399,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -416,16 +416,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -433,26 +433,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -460,16 +460,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -477,17 +477,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -496,7 +496,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -504,16 +504,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -521,16 +521,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -538,26 +538,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -565,16 +565,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -582,17 +582,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
@@ -601,7 +601,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grain
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -613,7 +613,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
@@ -623,7 +623,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -631,16 +631,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -648,16 +648,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -665,26 +665,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -692,16 +692,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -709,17 +709,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -728,7 +728,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -736,16 +736,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -753,16 +753,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -770,16 +770,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -787,21 +787,21 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -809,16 +809,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -826,17 +826,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -845,7 +845,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -853,16 +853,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -870,16 +870,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -887,26 +887,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -914,16 +914,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -931,17 +931,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -950,7 +950,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -958,16 +958,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -975,16 +975,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -992,26 +992,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1019,16 +1019,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1036,17 +1036,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1055,7 +1055,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 {
; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX803: atomicrmw.start:
@@ -1063,16 +1063,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX803: atomicrmw.end:
; GFX803-NEXT: ret <2 x half> [[TMP5]]
;
; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX906: atomicrmw.start:
@@ -1080,16 +1080,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX906: atomicrmw.end:
; GFX906-NEXT: ret <2 x half> [[TMP5]]
;
; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX908: atomicrmw.start:
@@ -1097,26 +1097,26 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX908: atomicrmw.end:
; GFX908-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX942-NEXT: ret <2 x half> [[RES]]
;
; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX10: atomicrmw.start:
@@ -1124,16 +1124,16 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX10: atomicrmw.end:
; GFX10-NEXT: ret <2 x half> [[TMP5]]
;
; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX11: atomicrmw.start:
@@ -1141,17 +1141,17 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX11: atomicrmw.end:
; GFX11-NEXT: ret <2 x half> [[TMP5]]
;
; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(
-; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] {
-; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] !prof [[PROF0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]], !amdgpu.ignore.denormal.mode [[META2]]
; GFX12-NEXT: ret <2 x half> [[RES]]
;
%res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
@@ -1164,7 +1164,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1176,7 +1176,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system(ptr addrspace(1) %ptr
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
@@ -1186,7 +1186,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system(ptr addrspace(1) %ptr
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1194,11 +1194,11 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grain
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
@@ -1208,7 +1208,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grain
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1216,11 +1216,11 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_mem
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
@@ -1230,7 +1230,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_remote_mem
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1238,11 +1238,11 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grain
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
@@ -1252,7 +1252,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_no_fine_grain
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1264,7 +1264,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
@@ -1274,7 +1274,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1282,11 +1282,11 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
@@ -1296,7 +1296,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1304,11 +1304,11 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
@@ -1318,7 +1318,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1326,11 +1326,11 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]]
; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP5]]
;
@@ -1344,7 +1344,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1356,7 +1356,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system(ptr addrspace(1) %ptr
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
@@ -1366,7 +1366,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system(ptr addrspace(1) %ptr
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1374,11 +1374,11 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grain
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
@@ -1388,7 +1388,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grain
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1396,11 +1396,11 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_mem
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
@@ -1410,7 +1410,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_remote_mem
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1418,11 +1418,11 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grain
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
@@ -1432,7 +1432,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_no_fine_grain
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1444,7 +1444,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
@@ -1454,7 +1454,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1462,11 +1462,11 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
@@ -1476,7 +1476,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1484,11 +1484,11 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
@@ -1498,7 +1498,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1506,11 +1506,11 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
@@ -1524,7 +1524,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1536,7 +1536,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system(ptr addrspace(1) %ptr
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
@@ -1546,7 +1546,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system(ptr addrspace(1) %ptr
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1554,11 +1554,11 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grain
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
@@ -1568,7 +1568,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grain
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1576,11 +1576,11 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_mem
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
@@ -1590,7 +1590,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_remote_mem
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1598,11 +1598,11 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grain
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[RES]]
;
@@ -1612,7 +1612,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_no_fine_grain
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1624,7 +1624,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
@@ -1634,7 +1634,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1642,11 +1642,11 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
@@ -1656,7 +1656,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1664,11 +1664,11 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
@@ -1678,7 +1678,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) {
; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
; COMMON: atomicrmw.start:
@@ -1686,11 +1686,11 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_system__amdgpu_ignore_denorm
; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]])
; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32
; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]]
; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret <2 x half> [[TMP6]]
;
@@ -1703,19 +1703,35 @@ attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" }
!0 = !{}
;.
-; GFX803: [[META0]] = !{}
+; GFX803: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX803: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX803: [[META2]] = !{}
;.
-; GFX906: [[META0]] = !{}
+; GFX906: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX906: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX906: [[META2]] = !{}
;.
-; GFX908: [[META0]] = !{}
+; GFX908: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX908: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX908: [[META2]] = !{}
;.
-; GFX90A: [[META0]] = !{}
+; GFX90A: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX90A: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX90A: [[META2]] = !{}
;.
-; GFX942: [[META0]] = !{}
+; GFX942: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX942: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX942: [[META2]] = !{}
;.
-; GFX10: [[META0]] = !{}
+; GFX10: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX10: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX10: [[META2]] = !{}
;.
-; GFX11: [[META0]] = !{}
+; GFX11: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX11: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX11: [[META2]] = !{}
;.
-; GFX12: [[META0]] = !{}
+; GFX12: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX12: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+; GFX12: [[META2]] = !{}
;.
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll
index c388877f9c284..023029d7938bc 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll
@@ -9,7 +9,7 @@
define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) {
; GFX900-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(
-; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX900-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX900-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX900: atomicrmw.start:
@@ -21,12 +21,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(ptr addrspace(1
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX900-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX900-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; GFX900: atomicrmw.end:
; GFX900-NEXT: ret <2 x half> [[TMP5]]
;
; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -38,12 +38,12 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(ptr addrspace(1
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x half> [[RES]]
;
; GFX942-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4
; GFX942-NEXT: ret <2 x half> [[RES]]
;
@@ -53,7 +53,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(ptr addrspace(1
define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; GFX900-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(
-; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX900-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX900-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX900: atomicrmw.start:
@@ -65,12 +65,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(ptr addrspac
; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX900-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX900-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX900: atomicrmw.end:
; GFX900-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(
-; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX90A: atomicrmw.start:
@@ -82,12 +82,12 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(ptr addrspac
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; GFX90A: atomicrmw.end:
; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]]
;
; GFX942-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(
-; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX942-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; GFX942-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4
; GFX942-NEXT: ret <2 x bfloat> [[RES]]
;
@@ -97,7 +97,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(ptr addrspac
define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) {
; CHECK-LABEL: define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align8(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -109,7 +109,7 @@ define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align8(ptr addrspace(1
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x half>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <4 x half> [[TMP5]]
;
@@ -119,7 +119,7 @@ define <4 x half> @test_atomicrmw_fadd_v4f16_global_agent_align8(ptr addrspace(1
define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align8(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -131,7 +131,7 @@ define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align8(ptr addrspac
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <4 x bfloat> [[TMP5]]
;
@@ -141,7 +141,7 @@ define <4 x bfloat> @test_atomicrmw_fadd_v4bf16_global_agent_align8(ptr addrspac
define <2 x float> @test_atomicrmw_fadd_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) {
; CHECK-LABEL: define <2 x float> @test_atomicrmw_fadd_v2f32_global_agent_align8(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -153,7 +153,7 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_global_agent_align8(ptr addrspace(
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <2 x float>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <2 x float> [[TMP5]]
;
@@ -167,7 +167,7 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_global_agent_align8(ptr addrspace(
define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) {
; CHECK-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align4(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -179,7 +179,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align4(ptr addrspace(1
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <2 x half> [[TMP5]]
;
@@ -189,7 +189,7 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent_align4(ptr addrspace(1
define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align4(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -201,7 +201,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align4(ptr addrspac
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <2 x bfloat> [[TMP5]]
;
@@ -211,7 +211,7 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent_align4(ptr addrspac
define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) {
; CHECK-LABEL: define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align8(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -223,7 +223,7 @@ define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align8(ptr addrspace(1
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x half>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <4 x half> [[TMP5]]
;
@@ -233,7 +233,7 @@ define <4 x half> @test_atomicrmw_fsub_v4f16_global_agent_align8(ptr addrspace(1
define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align8(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -245,7 +245,7 @@ define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align8(ptr addrspac
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <4 x bfloat> [[TMP5]]
;
@@ -255,7 +255,7 @@ define <4 x bfloat> @test_atomicrmw_fsub_v4bf16_global_agent_align8(ptr addrspac
define <2 x float> @test_atomicrmw_fsub_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) {
; CHECK-LABEL: define <2 x float> @test_atomicrmw_fsub_v2f32_global_agent_align8(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -267,7 +267,7 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_global_agent_align8(ptr addrspace(
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to <2 x float>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <2 x float> [[TMP5]]
;
@@ -281,7 +281,7 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_global_agent_align8(ptr addrspace(
define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) {
; CHECK-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align4(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -293,7 +293,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align4(ptr addrspace(1
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <2 x half> [[TMP6]]
;
@@ -303,7 +303,7 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent_align4(ptr addrspace(1
define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align4(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -315,7 +315,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align4(ptr addrspac
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <2 x bfloat> [[TMP6]]
;
@@ -325,7 +325,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_global_agent_align4(ptr addrspac
define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) {
; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align8(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -337,7 +337,7 @@ define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align8(ptr addrspace(1
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x half>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <4 x half> [[TMP6]]
;
@@ -347,7 +347,7 @@ define <4 x half> @test_atomicrmw_fmin_v4f16_global_agent_align8(ptr addrspace(1
define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align8(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -359,7 +359,7 @@ define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align8(ptr addrspac
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <4 x bfloat> [[TMP6]]
;
@@ -369,7 +369,7 @@ define <4 x bfloat> @test_atomicrmw_fmin_v4bf16_global_agent_align8(ptr addrspac
define <2 x float> @test_atomicrmw_fmin_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) {
; CHECK-LABEL: define <2 x float> @test_atomicrmw_fmin_v2f32_global_agent_align8(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -381,7 +381,7 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_global_agent_align8(ptr addrspace(
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <2 x float>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <2 x float> [[TMP6]]
;
@@ -395,7 +395,7 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_global_agent_align8(ptr addrspace(
define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) {
; CHECK-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align4(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -407,7 +407,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align4(ptr addrspace(1
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <2 x half> [[TMP6]]
;
@@ -417,7 +417,7 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent_align4(ptr addrspace(1
define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) {
; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align4(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -429,7 +429,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align4(ptr addrspac
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <2 x bfloat> [[TMP6]]
;
@@ -439,7 +439,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_global_agent_align4(ptr addrspac
define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align8(ptr addrspace(1) %ptr, <4 x half> %value) {
; CHECK-LABEL: define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align8(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x half> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -451,7 +451,7 @@ define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align8(ptr addrspace(1
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x half>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <4 x half> [[TMP6]]
;
@@ -461,7 +461,7 @@ define <4 x half> @test_atomicrmw_fmax_v4f16_global_agent_align8(ptr addrspace(1
define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align8(ptr addrspace(1) %ptr, <4 x bfloat> %value) {
; CHECK-LABEL: define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align8(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -473,7 +473,7 @@ define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align8(ptr addrspac
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <4 x bfloat>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <4 x bfloat> [[TMP6]]
;
@@ -483,7 +483,7 @@ define <4 x bfloat> @test_atomicrmw_fmax_v4bf16_global_agent_align8(ptr addrspac
define <2 x float> @test_atomicrmw_fmax_v2f32_global_agent_align8(ptr addrspace(1) %ptr, <2 x float> %value) {
; CHECK-LABEL: define <2 x float> @test_atomicrmw_fmax_v2f32_global_agent_align8(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] !prof [[PROF0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR]], align 8
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
@@ -495,10 +495,20 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_global_agent_align8(ptr addrspace(
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to <2 x float>
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret <2 x float> [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x float> %value syncscope("agent") seq_cst, align 8
ret <2 x float> %res
}
+;.
+; GFX900: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX900: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+;.
+; GFX90A: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX90A: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+;.
+; GFX942: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; GFX942: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+;.
diff --git a/llvm/test/Transforms/AtomicExpand/ARM/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/ARM/atomicrmw-fp.ll
index cc4104df5d725..1f0f7d6a754cf 100644
--- a/llvm/test/Transforms/AtomicExpand/ARM/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/ARM/atomicrmw-fp.ll
@@ -15,7 +15,7 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: call void @llvm.arm.dmb(i32 11)
; CHECK-NEXT: ret float [[TMP5]]
@@ -38,7 +38,7 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: call void @llvm.arm.dmb(i32 11)
; CHECK-NEXT: ret float [[TMP5]]
diff --git a/llvm/test/Transforms/AtomicExpand/LoongArch/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/LoongArch/atomicrmw-fp.ll
index 5ce831a4c4c83..93d2d13703fe0 100644
--- a/llvm/test/Transforms/AtomicExpand/LoongArch/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/LoongArch/atomicrmw-fp.ll
@@ -14,7 +14,7 @@ define float @atomicrmw_fadd_float(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[TMP5]]
;
@@ -35,7 +35,7 @@ define float @atomicrmw_fsub_float(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[TMP5]]
;
@@ -56,7 +56,7 @@ define float @atomicrmw_fmin_float(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[TMP6]]
;
@@ -77,7 +77,7 @@ define float @atomicrmw_fmax_float(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[TMP6]]
;
@@ -98,7 +98,7 @@ define double @atomicrmw_fadd_double(ptr %ptr, double %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret double [[TMP5]]
;
@@ -119,7 +119,7 @@ define double @atomicrmw_fsub_double(ptr %ptr, double %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret double [[TMP5]]
;
@@ -140,7 +140,7 @@ define double @atomicrmw_fmin_double(ptr %ptr, double %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret double [[TMP6]]
;
@@ -161,7 +161,7 @@ define double @atomicrmw_fmax_double(ptr %ptr, double %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
; CHECK-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret double [[TMP6]]
;
diff --git a/llvm/test/Transforms/AtomicExpand/Mips/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/Mips/atomicrmw-fp.ll
index 3244683916bc0..7f707bd5ae367 100644
--- a/llvm/test/Transforms/AtomicExpand/Mips/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/Mips/atomicrmw-fp.ll
@@ -15,7 +15,7 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: fence seq_cst
; CHECK-NEXT: ret float [[TMP5]]
@@ -38,7 +38,7 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: fence seq_cst
; CHECK-NEXT: ret float [[TMP5]]
diff --git a/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
index 840e2d3eee553..692cac401d2ae 100644
--- a/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
@@ -7,51 +7,42 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
-; CHECK-NEXT: %loaded = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %cmpxchg.end ]
-; CHECK-NEXT: %new = fadd float %loaded, %value
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast float %new to i32
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float %loaded to i32
-; CHECK-NEXT: br label %cmpxchg.start
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.start: ; preds = %cmpxchg.trystore, %atomicrmw.start
-; CHECK-NEXT: %larx = call i32 @llvm.ppc.lwarx(ptr %ptr)
-; CHECK-NEXT: %should_store = icmp eq i32 %larx, [[TMP3]]
-; CHECK-NEXT: br i1 %should_store, label %cmpxchg.fencedstore, label %cmpxchg.nostore
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.fencedstore: ; preds = %cmpxchg.start
-; CHECK-NEXT: br label %cmpxchg.trystore
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.trystore: ; preds = %cmpxchg.fencedstore
-; CHECK-NEXT: %loaded.trystore = phi i32 [ %larx, %cmpxchg.fencedstore ]
-; CHECK-NEXT: %stcx = call i32 @llvm.ppc.stwcx(ptr %ptr, i32 [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = xor i32 %stcx, 1
-; CHECK-NEXT: %success1 = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT: br i1 %success1, label %cmpxchg.success, label %cmpxchg.start
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.releasedload: ; No predecessors!
+; CHECK-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[CMPXCHG_END:%.*]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CHECK-NEXT: br label [[CMPXCHG_START:%.*]]
+; CHECK: cmpxchg.start:
+; CHECK-NEXT: [[LARX:%.*]] = call i32 @llvm.ppc.lwarx(ptr [[PTR]])
+; CHECK-NEXT: [[SHOULD_STORE:%.*]] = icmp eq i32 [[LARX]], [[TMP3]]
+; CHECK-NEXT: br i1 [[SHOULD_STORE]], label [[CMPXCHG_FENCEDSTORE:%.*]], label [[CMPXCHG_NOSTORE:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: cmpxchg.fencedstore:
+; CHECK-NEXT: br label [[CMPXCHG_TRYSTORE:%.*]]
+; CHECK: cmpxchg.trystore:
+; CHECK-NEXT: [[LOADED_TRYSTORE:%.*]] = phi i32 [ [[LARX]], [[CMPXCHG_FENCEDSTORE]] ]
+; CHECK-NEXT: [[STCX:%.*]] = call i32 @llvm.ppc.stwcx(ptr [[PTR]], i32 [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[STCX]], 1
+; CHECK-NEXT: [[SUCCESS1:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS1]], label [[CMPXCHG_SUCCESS:%.*]], label [[CMPXCHG_START]], !prof [[PROF1]]
+; CHECK: cmpxchg.releasedload:
; CHECK-NEXT: unreachable
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.success: ; preds = %cmpxchg.trystore
-; CHECK-NEXT: br label %cmpxchg.end
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.nostore: ; preds = %cmpxchg.start
-; CHECK-NEXT: %loaded.nostore = phi i32 [ %larx, %cmpxchg.start ]
-; CHECK-NEXT: br label %cmpxchg.failure
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.failure: ; preds = %cmpxchg.nostore
-; CHECK-NEXT: %loaded.failure = phi i32 [ %loaded.nostore, %cmpxchg.nostore ]
-; CHECK-NEXT: br label %cmpxchg.end
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.end: ; preds = %cmpxchg.failure, %cmpxchg.success
-; CHECK-NEXT: %loaded.exit = phi i32 [ %loaded.trystore, %cmpxchg.success ], [ %loaded.failure, %cmpxchg.failure ]
-; CHECK-NEXT: %success2 = phi i1 [ true, %cmpxchg.success ], [ false, %cmpxchg.failure ]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 %loaded.exit to float
-; CHECK-NEXT: br i1 %success2, label %atomicrmw.end, label %atomicrmw.start
-; CHECK-EMPTY:
-; CHECK-NEXT: atomicrmw.end: ; preds = %cmpxchg.end
+; CHECK: cmpxchg.success:
+; CHECK-NEXT: br label [[CMPXCHG_END]]
+; CHECK: cmpxchg.nostore:
+; CHECK-NEXT: [[LOADED_NOSTORE:%.*]] = phi i32 [ [[LARX]], [[CMPXCHG_START]] ]
+; CHECK-NEXT: br label [[CMPXCHG_FAILURE:%.*]]
+; CHECK: cmpxchg.failure:
+; CHECK-NEXT: [[LOADED_FAILURE:%.*]] = phi i32 [ [[LOADED_NOSTORE]], [[CMPXCHG_NOSTORE]] ]
+; CHECK-NEXT: br label [[CMPXCHG_END]]
+; CHECK: cmpxchg.end:
+; CHECK-NEXT: [[LOADED_EXIT:%.*]] = phi i32 [ [[LOADED_TRYSTORE]], [[CMPXCHG_SUCCESS]] ], [ [[LOADED_FAILURE]], [[CMPXCHG_FAILURE]] ]
+; CHECK-NEXT: [[SUCCESS2:%.*]] = phi i1 [ true, [[CMPXCHG_SUCCESS]] ], [ false, [[CMPXCHG_FAILURE]] ]
+; CHECK-NEXT: [[TMP5]] = bitcast i32 [[LOADED_EXIT]] to float
+; CHECK-NEXT: br i1 [[SUCCESS2]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2:![0-9]+]]
+; CHECK: atomicrmw.end:
; CHECK-NEXT: call void @llvm.ppc.lwsync()
; CHECK-NEXT: ret float [[TMP5]]
-; CHECK-NEXT: }
+;
%res = atomicrmw fadd ptr %ptr, float %value seq_cst
ret float %res
}
@@ -60,54 +51,44 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
; CHECK-LABEL: @test_atomicrmw_fsub_f32(
; CHECK-NEXT: call void @llvm.ppc.sync()
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
-; CHECK-NEXT: br label %atomicrmw.start
-; CHECK-EMPTY:
-; CHECK-NEXT: atomicrmw.start:
-; CHECK-NEXT: %loaded = phi float [ [[TMP1]], %0 ], [ [[TMP5:%.*]], %cmpxchg.end ]
-; CHECK-NEXT: %new = fsub float %loaded, %value
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast float %new to i32
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float %loaded to i32
-; CHECK-NEXT: br label %cmpxchg.start
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.start:
-; CHECK-NEXT: %larx = call i32 @llvm.ppc.lwarx(ptr %ptr)
-; CHECK-NEXT: %should_store = icmp eq i32 %larx, [[TMP3]]
-; CHECK-NEXT: br i1 %should_store, label %cmpxchg.fencedstore, label %cmpxchg.nostore
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.fencedstore: ; preds = %cmpxchg.start
-; CHECK-NEXT: br label %cmpxchg.trystore
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.trystore: ; preds = %cmpxchg.fencedstore
-; CHECK-NEXT: %loaded.trystore = phi i32 [ %larx, %cmpxchg.fencedstore ]
-; CHECK-NEXT: %stcx = call i32 @llvm.ppc.stwcx(ptr %ptr, i32 %2)
-; CHECK-NEXT: [[TMP4:%.*]] = xor i32 %stcx, 1
-; CHECK-NEXT: %success1 = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT: br i1 %success1, label %cmpxchg.success, label %cmpxchg.start
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.releasedload: ; No predecessors!
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[CMPXCHG_END:%.*]] ]
+; CHECK-NEXT: [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CHECK-NEXT: br label [[CMPXCHG_START:%.*]]
+; CHECK: cmpxchg.start:
+; CHECK-NEXT: [[LARX:%.*]] = call i32 @llvm.ppc.lwarx(ptr [[PTR]])
+; CHECK-NEXT: [[SHOULD_STORE:%.*]] = icmp eq i32 [[LARX]], [[TMP3]]
+; CHECK-NEXT: br i1 [[SHOULD_STORE]], label [[CMPXCHG_FENCEDSTORE:%.*]], label [[CMPXCHG_NOSTORE:%.*]], !prof [[PROF1]]
+; CHECK: cmpxchg.fencedstore:
+; CHECK-NEXT: br label [[CMPXCHG_TRYSTORE:%.*]]
+; CHECK: cmpxchg.trystore:
+; CHECK-NEXT: [[LOADED_TRYSTORE:%.*]] = phi i32 [ [[LARX]], [[CMPXCHG_FENCEDSTORE]] ]
+; CHECK-NEXT: [[STCX:%.*]] = call i32 @llvm.ppc.stwcx(ptr [[PTR]], i32 [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[STCX]], 1
+; CHECK-NEXT: [[SUCCESS1:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS1]], label [[CMPXCHG_SUCCESS:%.*]], label [[CMPXCHG_START]], !prof [[PROF1]]
+; CHECK: cmpxchg.releasedload:
; CHECK-NEXT: unreachable
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.success: ; preds = %cmpxchg.trystore
-; CHECK-NEXT: br label %cmpxchg.end
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.nostore: ; preds = %cmpxchg.start
-; CHECK-NEXT: %loaded.nostore = phi i32 [ %larx, %cmpxchg.start ]
-; CHECK-NEXT: br label %cmpxchg.failure
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.failure: ; preds = %cmpxchg.nostore
-; CHECK-NEXT: %loaded.failure = phi i32 [ %loaded.nostore, %cmpxchg.nostore ]
-; CHECK-NEXT: br label %cmpxchg.end
-; CHECK-EMPTY:
-; CHECK-NEXT: cmpxchg.end: ; preds = %cmpxchg.failure, %cmpxchg.success
-; CHECK-NEXT: %loaded.exit = phi i32 [ %loaded.trystore, %cmpxchg.success ], [ %loaded.failure, %cmpxchg.failure ]
-; CHECK-NEXT: %success2 = phi i1 [ true, %cmpxchg.success ], [ false, %cmpxchg.failure ]
-; CHECK-NEXT: [[TMP5]] = bitcast i32 %loaded.exit to float
-; CHECK-NEXT: br i1 %success2, label %atomicrmw.end, label %atomicrmw.start
-; CHECK-EMPTY:
-; CHECK-NEXT: atomicrmw.end: ; preds = %cmpxchg.end
+; CHECK: cmpxchg.success:
+; CHECK-NEXT: br label [[CMPXCHG_END]]
+; CHECK: cmpxchg.nostore:
+; CHECK-NEXT: [[LOADED_NOSTORE:%.*]] = phi i32 [ [[LARX]], [[CMPXCHG_START]] ]
+; CHECK-NEXT: br label [[CMPXCHG_FAILURE:%.*]]
+; CHECK: cmpxchg.failure:
+; CHECK-NEXT: [[LOADED_FAILURE:%.*]] = phi i32 [ [[LOADED_NOSTORE]], [[CMPXCHG_NOSTORE]] ]
+; CHECK-NEXT: br label [[CMPXCHG_END]]
+; CHECK: cmpxchg.end:
+; CHECK-NEXT: [[LOADED_EXIT:%.*]] = phi i32 [ [[LOADED_TRYSTORE]], [[CMPXCHG_SUCCESS]] ], [ [[LOADED_FAILURE]], [[CMPXCHG_FAILURE]] ]
+; CHECK-NEXT: [[SUCCESS2:%.*]] = phi i1 [ true, [[CMPXCHG_SUCCESS]] ], [ false, [[CMPXCHG_FAILURE]] ]
+; CHECK-NEXT: [[TMP5]] = bitcast i32 [[LOADED_EXIT]] to float
+; CHECK-NEXT: br i1 [[SUCCESS2]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF2]]
+; CHECK: atomicrmw.end:
; CHECK-NEXT: call void @llvm.ppc.lwsync()
; CHECK-NEXT: ret float [[TMP5]]
-; CHECK-NEXT: }
+;
%res = atomicrmw fsub ptr %ptr, float %value seq_cst
ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll
index 60fb2487faa13..94761ec336fdc 100644
--- a/llvm/test/Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll
@@ -19,7 +19,7 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { float, i1 } [[TMP6]], i1 [[TMP4]], 1
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { float, i1 } [[TMP7]], 1
; CHECK-NEXT: [[NEWLOADED]] = extractvalue { float, i1 } [[TMP7]], 0
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[NEWLOADED]]
;
@@ -45,7 +45,7 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { float, i1 } [[TMP6]], i1 [[TMP4]], 1
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { float, i1 } [[TMP7]], 1
; CHECK-NEXT: [[NEWLOADED]] = extractvalue { float, i1 } [[TMP7]], 0
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[NEWLOADED]]
;
diff --git a/llvm/test/Transforms/AtomicExpand/SPARC/libcalls.ll b/llvm/test/Transforms/AtomicExpand/SPARC/libcalls.ll
index 1d6a32ca093f1..9363623d8dd43 100644
--- a/llvm/test/Transforms/AtomicExpand/SPARC/libcalls.ll
+++ b/llvm/test/Transforms/AtomicExpand/SPARC/libcalls.ll
@@ -13,30 +13,34 @@ target triple = "sparc-unknown-unknown"
;; straightforward.
; CHECK-LABEL: @test_load_i16(
+; CHECK: !prof [[PROF_0:![0-9]+]] {
; CHECK: %1 = call i16 @__atomic_load_2(ptr %arg, i32 5)
; CHECK: ret i16 %1
-define i16 @test_load_i16(ptr %arg) {
+define i16 @test_load_i16(ptr %arg) !prof !0 {
%ret = load atomic i16, ptr %arg seq_cst, align 4
ret i16 %ret
}
; CHECK-LABEL: @test_store_i16(
+; CHECK: !prof [[PROF_0]] {
; CHECK: call void @__atomic_store_2(ptr %arg, i16 %val, i32 5)
; CHECK: ret void
-define void @test_store_i16(ptr %arg, i16 %val) {
+define void @test_store_i16(ptr %arg, i16 %val) !prof !0 {
store atomic i16 %val, ptr %arg seq_cst, align 4
ret void
}
; CHECK-LABEL: @test_exchange_i16(
+; CHECK: !prof [[PROF_0]] {
; CHECK: %1 = call i16 @__atomic_exchange_2(ptr %arg, i16 %val, i32 5)
; CHECK: ret i16 %1
-define i16 @test_exchange_i16(ptr %arg, i16 %val) {
+define i16 @test_exchange_i16(ptr %arg, i16 %val) !prof !0 {
%ret = atomicrmw xchg ptr %arg, i16 %val seq_cst
ret i16 %ret
}
; CHECK-LABEL: @test_cmpxchg_i16(
+; CHECK: !prof [[PROF_0]] {
; CHECK: %1 = alloca i16, align 2
; CHECK: call void @llvm.lifetime.start.p0(ptr %1)
; CHECK: store i16 %old, ptr %1, align 2
@@ -47,16 +51,17 @@ define i16 @test_exchange_i16(ptr %arg, i16 %val) {
; CHECK: %5 = insertvalue { i16, i1 } %4, i1 %2, 1
; CHECK: %ret = extractvalue { i16, i1 } %5, 0
; CHECK: ret i16 %ret
-define i16 @test_cmpxchg_i16(ptr %arg, i16 %old, i16 %new) {
+define i16 @test_cmpxchg_i16(ptr %arg, i16 %old, i16 %new) !prof !0 {
%ret_succ = cmpxchg ptr %arg, i16 %old, i16 %new seq_cst monotonic
%ret = extractvalue { i16, i1 } %ret_succ, 0
ret i16 %ret
}
; CHECK-LABEL: @test_add_i16(
+; CHECK: !prof [[PROF_0]] {
; CHECK: %1 = call i16 @__atomic_fetch_add_2(ptr %arg, i16 %val, i32 5)
; CHECK: ret i16 %1
-define i16 @test_add_i16(ptr %arg, i16 %val) {
+define i16 @test_add_i16(ptr %arg, i16 %val) !prof !0 {
%ret = atomicrmw add ptr %arg, i16 %val seq_cst
ret i16 %ret
}
@@ -67,30 +72,33 @@ define i16 @test_add_i16(ptr %arg, i16 %val) {
;; 32-bit i386.
; CHECK-LABEL: @test_load_i128(
+; CHECK: !prof [[PROF_0]] {
; CHECK: %1 = alloca i128, align 8
; CHECK: call void @llvm.lifetime.start.p0(ptr %1)
; CHECK: call void @__atomic_load(i32 16, ptr %arg, ptr %1, i32 5)
; CHECK: %2 = load i128, ptr %1, align 8
; CHECK: call void @llvm.lifetime.end.p0(ptr %1)
; CHECK: ret i128 %2
-define i128 @test_load_i128(ptr %arg) {
+define i128 @test_load_i128(ptr %arg) !prof !0 {
%ret = load atomic i128, ptr %arg seq_cst, align 16
ret i128 %ret
}
; CHECK-LABEL: @test_store_i128(
+; CHECK: !prof [[PROF_0]] {
; CHECK: %1 = alloca i128, align 8
; CHECK: call void @llvm.lifetime.start.p0(ptr %1)
; CHECK: store i128 %val, ptr %1, align 8
; CHECK: call void @__atomic_store(i32 16, ptr %arg, ptr %1, i32 5)
; CHECK: call void @llvm.lifetime.end.p0(ptr %1)
; CHECK: ret void
-define void @test_store_i128(ptr %arg, i128 %val) {
+define void @test_store_i128(ptr %arg, i128 %val) !prof !0 {
store atomic i128 %val, ptr %arg seq_cst, align 16
ret void
}
; CHECK-LABEL: @test_exchange_i128(
+; CHECK: !prof [[PROF_0]] {
; CHECK: %1 = alloca i128, align 8
; CHECK: call void @llvm.lifetime.start.p0(ptr %1)
; CHECK: store i128 %val, ptr %1, align 8
@@ -101,12 +109,13 @@ define void @test_store_i128(ptr %arg, i128 %val) {
; CHECK: %3 = load i128, ptr %2, align 8
; CHECK: call void @llvm.lifetime.end.p0(ptr %2)
; CHECK: ret i128 %3
-define i128 @test_exchange_i128(ptr %arg, i128 %val) {
+define i128 @test_exchange_i128(ptr %arg, i128 %val) !prof !0 {
%ret = atomicrmw xchg ptr %arg, i128 %val seq_cst
ret i128 %ret
}
; CHECK-LABEL: @test_cmpxchg_i128(
+; CHECK: !prof [[PROF_0]] {
; CHECK: %1 = alloca i128, align 8
; CHECK: call void @llvm.lifetime.start.p0(ptr %1)
; CHECK: store i128 %old, ptr %1, align 8
@@ -121,7 +130,7 @@ define i128 @test_exchange_i128(ptr %arg, i128 %val) {
; CHECK: %6 = insertvalue { i128, i1 } %5, i1 %3, 1
; CHECK: %ret = extractvalue { i128, i1 } %6, 0
; CHECK: ret i128 %ret
-define i128 @test_cmpxchg_i128(ptr %arg, i128 %old, i128 %new) {
+define i128 @test_cmpxchg_i128(ptr %arg, i128 %old, i128 %new) !prof !0 {
%ret_succ = cmpxchg ptr %arg, i128 %old, i128 %new seq_cst monotonic
%ret = extractvalue { i128, i1 } %ret_succ, 0
ret i128 %ret
@@ -132,6 +141,7 @@ define i128 @test_cmpxchg_i128(ptr %arg, i128 %old, i128 %new) {
; loop, which then itself expands into a libcall.
; CHECK-LABEL: @test_add_i128(
+; CHECK: !prof [[PROF_0]] {
; CHECK: %1 = alloca i128, align 8
; CHECK: %2 = alloca i128, align 8
; CHECK: %3 = load i128, ptr %arg, align 16
@@ -151,10 +161,10 @@ define i128 @test_cmpxchg_i128(ptr %arg, i128 %old, i128 %new) {
; CHECK: %7 = insertvalue { i128, i1 } %6, i1 %4, 1
; CHECK: %success = extractvalue { i128, i1 } %7, 1
; CHECK: %newloaded = extractvalue { i128, i1 } %7, 0
-; CHECK: br i1 %success, label %atomicrmw.end, label %atomicrmw.start
+; CHECK: br i1 %success, label %atomicrmw.end, label %atomicrmw.start, !prof [[PROF_1:![0-9]+]]
; CHECK:atomicrmw.end:
; CHECK: ret i128 %newloaded
-define i128 @test_add_i128(ptr %arg, i128 %val) {
+define i128 @test_add_i128(ptr %arg, i128 %val) !prof !0 {
%ret = atomicrmw add ptr %arg, i128 %val seq_cst
ret i128 %ret
}
@@ -162,24 +172,27 @@ define i128 @test_add_i128(ptr %arg, i128 %val) {
;; Ensure that non-integer types get bitcast correctly on the way in and out of a libcall:
; CHECK-LABEL: @test_load_double(
+; CHECK: !prof [[PROF_0]] {
; CHECK: %1 = call i64 @__atomic_load_8(ptr %arg, i32 5)
; CHECK: %2 = bitcast i64 %1 to double
; CHECK: ret double %2
-define double @test_load_double(ptr %arg, double %val) {
+define double @test_load_double(ptr %arg, double %val) !prof !0 {
%1 = load atomic double, ptr %arg seq_cst, align 16
ret double %1
}
; CHECK-LABEL: @test_store_double(
+; CHECK: !prof [[PROF_0]] {
; CHECK: %1 = bitcast double %val to i64
; CHECK: call void @__atomic_store_8(ptr %arg, i64 %1, i32 5)
; CHECK: ret void
-define void @test_store_double(ptr %arg, double %val) {
+define void @test_store_double(ptr %arg, double %val) !prof !0 {
store atomic double %val, ptr %arg seq_cst, align 16
ret void
}
; CHECK-LABEL: @test_cmpxchg_ptr(
+; CHECK: !prof [[PROF_0]] {
; CHECK: %1 = alloca ptr, align 4
; CHECK: call void @llvm.lifetime.start.p0(ptr %1)
; CHECK: store ptr %old, ptr %1, align 4
@@ -192,7 +205,7 @@ define void @test_store_double(ptr %arg, double %val) {
; CHECK: %ret = extractvalue { ptr, i1 } %6, 0
; CHECK: ret ptr %ret
; CHECK: }
-define ptr @test_cmpxchg_ptr(ptr %arg, ptr %old, ptr %new) {
+define ptr @test_cmpxchg_ptr(ptr %arg, ptr %old, ptr %new) !prof !0 {
%ret_succ = cmpxchg ptr %arg, ptr %old, ptr %new seq_cst acquire
%ret = extractvalue { ptr, i1 } %ret_succ, 0
ret ptr %ret
@@ -201,13 +214,14 @@ define ptr @test_cmpxchg_ptr(ptr %arg, ptr %old, ptr %new) {
;; ...and for a non-integer type of large size too.
; CHECK-LABEL: @test_store_fp128
+; CHECK: !prof [[PROF_0]] {
; CHECK: %1 = alloca fp128, align 8
; CHECK: call void @llvm.lifetime.start.p0(ptr %1)
; CHECK: store fp128 %val, ptr %1, align 8
; CHECK: call void @__atomic_store(i32 16, ptr %arg, ptr %1, i32 5)
; CHECK: call void @llvm.lifetime.end.p0(ptr %1)
; CHECK: ret void
-define void @test_store_fp128(ptr %arg, fp128 %val) {
+define void @test_store_fp128(ptr %arg, fp128 %val) !prof !0 {
store atomic fp128 %val, ptr %arg seq_cst, align 16
ret void
}
@@ -218,15 +232,21 @@ define void @test_store_fp128(ptr %arg, fp128 %val) {
;; when such support is added, they should also be tested here.
; CHECK-LABEL: @test_unaligned_load_i16(
+; CHECK: !prof [[PROF_0]] {
; CHECK: __atomic_load(
-define i16 @test_unaligned_load_i16(ptr %arg) {
+define i16 @test_unaligned_load_i16(ptr %arg) !prof !0 {
%ret = load atomic i16, ptr %arg seq_cst, align 1
ret i16 %ret
}
; CHECK-LABEL: @test_unaligned_store_i16(
+; CHECK: !prof [[PROF_0]] {
; CHECK: __atomic_store(
-define void @test_unaligned_store_i16(ptr %arg, i16 %val) {
+define void @test_unaligned_store_i16(ptr %arg, i16 %val) !prof !0 {
store atomic i16 %val, ptr %arg seq_cst, align 1
ret void
}
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_1]] = !{!"unknown", !"atomic-expand"}
\ No newline at end of file
diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll
index 024b2e33af5da..7d1645570e92a 100644
--- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll
@@ -14,7 +14,7 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[TMP5]]
;
@@ -35,7 +35,7 @@ define double @test_atomicrmw_fadd_f64(ptr %ptr, double %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret double [[TMP5]]
;
@@ -56,7 +56,7 @@ define float @test_atomicrmw_fadd_f32_as1(ptr addrspace(1) %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[TMP5]]
;
@@ -77,7 +77,7 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret float [[TMP5]]
;
@@ -98,7 +98,7 @@ define double @test_atomicrmw_fsub_f64(ptr %ptr, double %value) {
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
; CHECK-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: ret double [[TMP5]]
;
diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-initial-load.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-initial-load.ll
index 316660ddfc49e..66d117ddba032 100644
--- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-initial-load.ll
+++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-initial-load.ll
@@ -3,9 +3,13 @@
; This file tests the function `llvm::expandAtomicRMWToCmpXchg`.
; It isn't technically target specific, but is exposed through a pass that is.
-define i8 @test_initial_load(ptr %ptr, i8 %value) {
+define i8 @test_initial_load(ptr %ptr, i8 %value) !prof !0 {
%res = atomicrmw nand ptr %ptr, i8 %value seq_cst
ret i8 %res
}
; CHECK-LABEL: @test_initial_load
+; CHECK: !prof [[PROF_0:![0-9]+]] {
; CHECK-NEXT: %1 = load i8, ptr %ptr, align 1
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
\ No newline at end of file
diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll
index 2341eb05c568c..2d370e47df63d 100644
--- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll
@@ -10,7 +10,7 @@ define double @atomic_xchg_f64(ptr %ptr) nounwind {
; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 4616189618054758400 seq_cst seq_cst, align 8
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[NEWLOADED]] to double
; CHECK-NEXT: ret double [[TMP3]]
@@ -28,7 +28,7 @@ define double @atomic_xchg_f64_as1(ptr addrspace(1) %ptr) nounwind {
; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 4616189618054758400 seq_cst seq_cst, align 8
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[NEWLOADED]] to double
; CHECK-NEXT: ret double [[TMP3]]
@@ -39,14 +39,14 @@ define double @atomic_xchg_f64_as1(ptr addrspace(1) %ptr) nounwind {
define double @atomic_xchg_f64_preserve_md(ptr %ptr) nounwind {
; CHECK-LABEL: @atomic_xchg_f64_preserve_md(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR:%.*]], align 8, !mmra [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR:%.*]], align 8, !mmra [[META2:![0-9]+]]
; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 4616189618054758400 seq_cst seq_cst, align 8, !mmra [[META0]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 4616189618054758400 seq_cst seq_cst, align 8, !mmra [[META2]]
; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
-; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[NEWLOADED]] to double
; CHECK-NEXT: ret double [[TMP3]]
More information about the llvm-commits
mailing list