[clang] Add support for cluster sync scope (PR #162575)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Oct 21 02:48:32 PDT 2025
https://github.com/macurtis-amd updated https://github.com/llvm/llvm-project/pull/162575
From 129bf4aae1ef3aae32b1b6903edb9fab07b2cead Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Mon, 20 Oct 2025 11:12:24 -0500
Subject: [PATCH 1/2] [clang] Add support for cluster sync scope
---
clang/docs/HIPSupport.rst | 2 +
clang/docs/LanguageExtensions.rst | 1 +
clang/include/clang/Basic/SyncScope.h | 45 +-
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 29 +-
clang/lib/CodeGen/Targets/AMDGPU.cpp | 4 +
clang/lib/CodeGen/Targets/SPIR.cpp | 2 +
clang/lib/Frontend/InitPreprocessor.cpp | 2 +
clang/test/CodeGen/scoped-atomic-ops.c | 4483 ++++++++++++++++-
clang/test/CodeGen/scoped-fence-ops.c | 41 +-
.../test/CodeGenOpenCL/builtins-amdgcn-vi.cl | 59 +-
clang/test/Preprocessor/init-aarch64.c | 2 +
clang/test/Preprocessor/init-loongarch.c | 22 +-
clang/test/Preprocessor/init.c | 17 +-
clang/test/SemaCUDA/atomic-ops.cu | 9 +-
.../test/SemaCUDA/spirv-amdgcn-atomic-ops.cu | 9 +-
15 files changed, 4462 insertions(+), 265 deletions(-)
diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst
index b4a671e3cfa3c..ec2af2a6f569d 100644
--- a/clang/docs/HIPSupport.rst
+++ b/clang/docs/HIPSupport.rst
@@ -164,6 +164,8 @@ Predefined Macros
- Represents wavefront memory scope in HIP (value is 2).
* - ``__HIP_MEMORY_SCOPE_WORKGROUP``
- Represents workgroup memory scope in HIP (value is 3).
+ * - ``__HIP_MEMORY_SCOPE_CLUSTER``
+ - Represents cluster memory scope in HIP (value is 6).
* - ``__HIP_MEMORY_SCOPE_AGENT``
- Represents agent memory scope in HIP (value is 4).
* - ``__HIP_MEMORY_SCOPE_SYSTEM``
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 6bb99c757cd19..bef6e9c14b182 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -4846,6 +4846,7 @@ currently supported:
* ``__MEMORY_SCOPE_SYSTEM``
* ``__MEMORY_SCOPE_DEVICE``
* ``__MEMORY_SCOPE_WRKGRP``
+* ``__MEMORY_SCOPE_CLUSTR``
* ``__MEMORY_SCOPE_WVFRNT``
* ``__MEMORY_SCOPE_SINGLE``
diff --git a/clang/include/clang/Basic/SyncScope.h b/clang/include/clang/Basic/SyncScope.h
index 5a8d2a7dd02e5..7776c3d83a77d 100644
--- a/clang/include/clang/Basic/SyncScope.h
+++ b/clang/include/clang/Basic/SyncScope.h
@@ -43,11 +43,13 @@ enum class SyncScope {
SystemScope,
DeviceScope,
WorkgroupScope,
+ ClusterScope,
WavefrontScope,
SingleScope,
HIPSingleThread,
HIPWavefront,
HIPWorkgroup,
+ HIPCluster,
HIPAgent,
HIPSystem,
OpenCLWorkGroup,
@@ -65,6 +67,8 @@ inline llvm::StringRef getAsString(SyncScope S) {
return "device_scope";
case SyncScope::WorkgroupScope:
return "workgroup_scope";
+ case SyncScope::ClusterScope:
+ return "cluster_scope";
case SyncScope::WavefrontScope:
return "wavefront_scope";
case SyncScope::SingleScope:
@@ -75,6 +79,8 @@ inline llvm::StringRef getAsString(SyncScope S) {
return "hip_wavefront";
case SyncScope::HIPWorkgroup:
return "hip_workgroup";
+ case SyncScope::HIPCluster:
+ return "hip_cluster";
case SyncScope::HIPAgent:
return "hip_agent";
case SyncScope::HIPSystem:
@@ -174,13 +180,18 @@ class AtomicScopeHIPModel : public AtomicScopeModel {
/// The enum values match the pre-defined macros
/// __HIP_MEMORY_SCOPE_*, which are used to define memory_scope_*
/// enums in hip-c.h.
+ /// These may be present in pch files or bitcode so preserve existing values
+ /// when adding a new ID.
enum ID {
SingleThread = 1,
Wavefront = 2,
Workgroup = 3,
Agent = 4,
System = 5,
- Last = System
+ Cluster = 6,
+ End,
+ Last = End - 1,
+ Count = Last
};
AtomicScopeHIPModel() {}
@@ -193,10 +204,14 @@ class AtomicScopeHIPModel : public AtomicScopeModel {
return SyncScope::HIPWavefront;
case Workgroup:
return SyncScope::HIPWorkgroup;
+ case Cluster:
+ return SyncScope::HIPCluster;
case Agent:
return SyncScope::HIPAgent;
case System:
return SyncScope::HIPSystem;
+ case End:
+ break;
}
llvm_unreachable("Invalid language sync scope value");
}
@@ -207,11 +222,12 @@ class AtomicScopeHIPModel : public AtomicScopeModel {
}
ArrayRef<unsigned> getRuntimeValues() const override {
- static_assert(Last == System, "Does not include all sync scopes");
static const unsigned Scopes[] = {
static_cast<unsigned>(SingleThread), static_cast<unsigned>(Wavefront),
- static_cast<unsigned>(Workgroup), static_cast<unsigned>(Agent),
- static_cast<unsigned>(System)};
+ static_cast<unsigned>(Workgroup), static_cast<unsigned>(Cluster),
+ static_cast<unsigned>(System), static_cast<unsigned>(Agent)};
+ static_assert(sizeof(Scopes) / sizeof(Scopes[0]) == Count,
+ "Does not include all sync scopes");
return llvm::ArrayRef(Scopes);
}
@@ -223,14 +239,18 @@ class AtomicScopeHIPModel : public AtomicScopeModel {
/// Defines the generic atomic scope model.
class AtomicScopeGenericModel : public AtomicScopeModel {
public:
- /// The enum values match predefined built-in macros __ATOMIC_SCOPE_*.
+ /// The enum values match predefined built-in macros __MEMORY_SCOPE_*.
+ /// These may be present in pch files or bitcode so preserve existing values
+ /// when adding a new ID.
enum ID {
System = 0,
Device = 1,
Workgroup = 2,
Wavefront = 3,
Single = 4,
- Last = Single
+ Cluster = 5,
+ Count,
+ Last = Count - 1
};
AtomicScopeGenericModel() = default;
@@ -243,10 +263,14 @@ class AtomicScopeGenericModel : public AtomicScopeModel {
return SyncScope::SystemScope;
case Workgroup:
return SyncScope::WorkgroupScope;
+ case Cluster:
+ return SyncScope::ClusterScope;
case Wavefront:
return SyncScope::WavefrontScope;
case Single:
return SyncScope::SingleScope;
+ case Count:
+ break;
}
llvm_unreachable("Invalid language sync scope value");
}
@@ -256,11 +280,12 @@ class AtomicScopeGenericModel : public AtomicScopeModel {
}
ArrayRef<unsigned> getRuntimeValues() const override {
- static_assert(Last == Single, "Does not include all sync scopes");
static const unsigned Scopes[] = {
- static_cast<unsigned>(Device), static_cast<unsigned>(System),
- static_cast<unsigned>(Workgroup), static_cast<unsigned>(Wavefront),
- static_cast<unsigned>(Single)};
+ static_cast<unsigned>(System), static_cast<unsigned>(Device),
+ static_cast<unsigned>(Workgroup), static_cast<unsigned>(Cluster),
+ static_cast<unsigned>(Wavefront), static_cast<unsigned>(Single)};
+ static_assert(sizeof(Scopes) / sizeof(Scopes[0]) == Count,
+ "Does not include all sync scopes");
return llvm::ArrayRef(Scopes);
}
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 5049a0ab0a395..6f281c7a3f843 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -12,6 +12,7 @@
#include "CGBuiltin.h"
#include "CodeGenFunction.h"
+#include "clang/Basic/SyncScope.h"
#include "clang/Basic/TargetBuiltins.h"
#include "clang/Frontend/FrontendDiagnostic.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -313,33 +314,33 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
}
// Older builtins had an enum argument for the memory scope.
+ const char *ssn = nullptr;
int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
switch (scope) {
- case 0: // __MEMORY_SCOPE_SYSTEM
+ case AtomicScopeGenericModel::System: // __MEMORY_SCOPE_SYSTEM
SSID = llvm::SyncScope::System;
break;
- case 1: // __MEMORY_SCOPE_DEVICE
- if (getTarget().getTriple().isSPIRV())
- SSID = getLLVMContext().getOrInsertSyncScopeID("device");
- else
- SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
+ case AtomicScopeGenericModel::Device: // __MEMORY_SCOPE_DEVICE
+ ssn = getTarget().getTriple().isSPIRV() ? "device" : "agent";
break;
- case 2: // __MEMORY_SCOPE_WRKGRP
- SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
+ case AtomicScopeGenericModel::Workgroup: // __MEMORY_SCOPE_WRKGRP
+ ssn = "workgroup";
break;
- case 3: // __MEMORY_SCOPE_WVFRNT
- if (getTarget().getTriple().isSPIRV())
- SSID = getLLVMContext().getOrInsertSyncScopeID("subgroup");
- else
- SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
+ case AtomicScopeGenericModel::Cluster: // __MEMORY_SCOPE_CLUSTR
+ ssn = getTarget().getTriple().isSPIRV() ? "workgroup" : "cluster";
+ break;
+ case AtomicScopeGenericModel::Wavefront: // __MEMORY_SCOPE_WVFRNT
+ ssn = getTarget().getTriple().isSPIRV() ? "subgroup" : "wavefront";
break;
- case 4: // __MEMORY_SCOPE_SINGLE
+ case AtomicScopeGenericModel::Single: // __MEMORY_SCOPE_SINGLE
SSID = llvm::SyncScope::SingleThread;
break;
default:
SSID = llvm::SyncScope::System;
break;
}
+ if (ssn)
+ SSID = getLLVMContext().getOrInsertSyncScopeID(ssn);
}
llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 16d5919d62cbb..0bc4b4b7025f2 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -508,6 +508,10 @@ AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
case SyncScope::WavefrontScope:
Name = "wavefront";
break;
+ case SyncScope::HIPCluster:
+ case SyncScope::ClusterScope:
+ Name = "cluster";
+ break;
case SyncScope::HIPWorkgroup:
case SyncScope::OpenCLWorkGroup:
case SyncScope::WorkgroupScope:
diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
index 3f6d4e0a9277a..80e096ecf5ae9 100644
--- a/clang/lib/CodeGen/Targets/SPIR.cpp
+++ b/clang/lib/CodeGen/Targets/SPIR.cpp
@@ -93,6 +93,8 @@ inline StringRef mapClangSyncScopeToLLVM(SyncScope Scope) {
case SyncScope::OpenCLSubGroup:
case SyncScope::WavefrontScope:
return "subgroup";
+ case SyncScope::HIPCluster:
+ case SyncScope::ClusterScope:
case SyncScope::HIPWorkgroup:
case SyncScope::OpenCLWorkGroup:
case SyncScope::WorkgroupScope:
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index baad63179d89a..47f1d5a6b636c 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -585,6 +585,7 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
Builder.defineMacro("__HIP_MEMORY_SCOPE_WORKGROUP", "3");
Builder.defineMacro("__HIP_MEMORY_SCOPE_AGENT", "4");
Builder.defineMacro("__HIP_MEMORY_SCOPE_SYSTEM", "5");
+ Builder.defineMacro("__HIP_MEMORY_SCOPE_CLUSTER", "6");
if (LangOpts.HIPStdPar) {
Builder.defineMacro("__HIPSTDPAR__");
if (LangOpts.HIPStdParInterposeAlloc) {
@@ -873,6 +874,7 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
Builder.defineMacro("__MEMORY_SCOPE_WRKGRP", "2");
Builder.defineMacro("__MEMORY_SCOPE_WVFRNT", "3");
Builder.defineMacro("__MEMORY_SCOPE_SINGLE", "4");
+ Builder.defineMacro("__MEMORY_SCOPE_CLUSTR", "5");
// Define macros for the OpenCL memory scope.
// The values should match AtomicScopeOpenCLModel::ID enum.
diff --git a/clang/test/CodeGen/scoped-atomic-ops.c b/clang/test/CodeGen/scoped-atomic-ops.c
index 545a6c90892c2..c39048120a457 100644
--- a/clang/test/CodeGen/scoped-atomic-ops.c
+++ b/clang/test/CodeGen/scoped-atomic-ops.c
@@ -1,113 +1,772 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
-// RUN: -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
+// RUN: -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN_CL_DEF %s
// RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
-// RUN: -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
+// RUN: -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN_CL_20 %s
// RUN: %clang_cc1 %s -emit-llvm -o - -triple=spirv64-unknown-unknown -ffreestanding \
// RUN: -fvisibility=hidden | FileCheck --check-prefix=SPIRV %s
-// AMDGCN-LABEL: define hidden i32 @fi1a(
-// AMDGCN: [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:.+]] monotonic, align 4
-// AMDGCN: [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: define hidden spir_func i32 @fi1a(
-// SPIRV: [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:.+]] monotonic, align 4
-// SPIRV: [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi1a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[V:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[V_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP1]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load atomic i32, ptr [[TMP2]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP5]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = load atomic i32, ptr [[TMP6]] syncscope("cluster") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP9]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load atomic i32, ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP11]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP12]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi1a(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[V:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load atomic i32, ptr [[TMP2]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load atomic i32, ptr [[TMP6]] syncscope("cluster") monotonic, align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP9]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load atomic i32, ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP11]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: ret i32 [[TMP12]]
+//
+// SPIRV-LABEL: define hidden spir_func i32 @fi1a(
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[V:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP1]], ptr [[V]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP3:%.*]] = load atomic i32, ptr [[TMP2]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[V]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP5]], ptr [[V]], align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP7:%.*]] = load atomic i32, ptr [[TMP6]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP7]], ptr [[V]], align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP9]], ptr [[V]], align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP11:%.*]] = load atomic i32, ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP11]], ptr [[V]], align 4
+// SPIRV-NEXT: [[TMP12:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT: ret i32 [[TMP12]]
+//
int fi1a(int *i) {
int v;
__scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
__scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
__scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
+ __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
__scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
__scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
return v;
}
// AMDGCN-LABEL: define hidden i32 @fi1b(
-// AMDGCN: [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:%.+]] monotonic, align 4
-// AMDGCN: [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:%.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:%.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN-NEXT: [[ATOMIC_TEMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP1]] to ptr
+// AMDGCN-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN-NEXT: [[ATOMIC_TEMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP3]] to ptr
+// AMDGCN-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN-NEXT: [[ATOMIC_TEMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP5]] to ptr
+// AMDGCN-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// AMDGCN-NEXT: store i32 [[TMP1]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP2:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4
+// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("agent") monotonic, align 4
+// AMDGCN-NEXT: store i32 [[TMP5]], ptr [[ATOMIC_TEMP1_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP6:%.*]] = load i32, ptr [[ATOMIC_TEMP1_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 [[TMP6]], ptr [[TMP7]], align 4
+// AMDGCN-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("workgroup") monotonic, align 4
+// AMDGCN-NEXT: store i32 [[TMP9]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP10:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP11:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 [[TMP10]], ptr [[TMP11]], align 4
+// AMDGCN-NEXT: [[TMP12:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP13:%.*]] = load atomic i32, ptr [[TMP12]] syncscope("cluster") monotonic, align 4
+// AMDGCN-NEXT: store i32 [[TMP13]], ptr [[ATOMIC_TEMP3_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP14:%.*]] = load i32, ptr [[ATOMIC_TEMP3_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP15:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 [[TMP14]], ptr [[TMP15]], align 4
+// AMDGCN-NEXT: [[TMP16:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP17:%.*]] = load atomic i32, ptr [[TMP16]] syncscope("wavefront") monotonic, align 4
+// AMDGCN-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP19:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN-NEXT: [[TMP20:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP21:%.*]] = load atomic i32, ptr [[TMP20]] syncscope("singlethread") monotonic, align 4
+// AMDGCN-NEXT: store i32 [[TMP21]], ptr [[ATOMIC_TEMP5_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP22:%.*]] = load i32, ptr [[ATOMIC_TEMP5_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP23:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 [[TMP22]], ptr [[TMP23]], align 4
+// AMDGCN-NEXT: [[TMP24:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// AMDGCN-NEXT: ret i32 [[TMP25]]
+//
// SPIRV-LABEL: define hidden spir_func i32 @fi1b(
-// SPIRV: [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:%.+]] monotonic, align 4
-// SPIRV: [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:%.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:%.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP1]], ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP5]], ptr [[ATOMIC_TEMP1]], align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[ATOMIC_TEMP1]], align 4
+// SPIRV-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP6]], ptr [[TMP7]], align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP9]], ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP11:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP10]], ptr [[TMP11]], align 4
+// SPIRV-NEXT: [[TMP12:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP13:%.*]] = load atomic i32, ptr [[TMP12]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP13]], ptr [[ATOMIC_TEMP3]], align 4
+// SPIRV-NEXT: [[TMP14:%.*]] = load i32, ptr [[ATOMIC_TEMP3]], align 4
+// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP14]], ptr [[TMP15]], align 4
+// SPIRV-NEXT: [[TMP16:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP17:%.*]] = load atomic i32, ptr [[TMP16]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP21:%.*]] = load atomic i32, ptr [[TMP20]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP21]], ptr [[ATOMIC_TEMP5]], align 4
+// SPIRV-NEXT: [[TMP22:%.*]] = load i32, ptr [[ATOMIC_TEMP5]], align 4
+// SPIRV-NEXT: [[TMP23:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP22]], ptr [[TMP23]], align 4
+// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// SPIRV-NEXT: ret i32 [[TMP25]]
+//
int fi1b(int *i) {
*i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
*i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
*i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
+ *i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
*i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
*i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
return *i;
}
-// AMDGCN-LABEL: define hidden void @fi2a(
-// AMDGCN: store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi2a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[V:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[V_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: ret void
+//
+// AMDGCN_CL_20-LABEL: define hidden void @fi2a(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[V:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP2]], ptr [[TMP0]] monotonic, align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP5]], ptr [[TMP3]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP8]], ptr [[TMP6]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP11]], ptr [[TMP9]] syncscope("cluster") monotonic, align 4
+// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
+// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP14]], ptr [[TMP12]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP17]], ptr [[TMP15]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_20-NEXT: ret void
+//
// SPIRV-LABEL: define hidden spir_func void @fi2a(
-// SPIRV: store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4
-// SPIRV: store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("device") monotonic, align 4
-// SPIRV: store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[V:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[V]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP5:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP7:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP9:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: ret void
+//
void fi2a(int *i) {
int v = 1;
__scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
__scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
__scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
+ __scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
__scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
__scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
}
// AMDGCN-LABEL: define hidden void @fi2b(
-// AMDGCN: store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[DOTATOMICTMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[DOTATOMICTMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN-NEXT: [[DOTATOMICTMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP2]] to ptr
+// AMDGCN-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN-NEXT: [[DOTATOMICTMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP4]] to ptr
+// AMDGCN-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4
+// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("agent") monotonic, align 4
+// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP2_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTATOMICTMP2_ASCAST]], align 4
+// AMDGCN-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// AMDGCN-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") monotonic, align 4
+// AMDGCN-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP4_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTATOMICTMP4_ASCAST]], align 4
+// AMDGCN-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
+// AMDGCN-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// AMDGCN-NEXT: ret void
+//
// SPIRV-LABEL: define hidden spir_func void @fi2b(
-// SPIRV: store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4
-// SPIRV: store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("device") monotonic, align 4
-// SPIRV: store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP2:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP4:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP2]], align 4
+// SPIRV-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTATOMICTMP2]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP4]], align 4
+// SPIRV-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTATOMICTMP4]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: ret void
+//
void fi2b(int *i) {
__scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
__scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
__scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
+ __scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
__scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
__scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
}
-// AMDGCN-LABEL: define hidden void @fi3a(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4
-// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] monotonic, align 4
-// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] monotonic, align 4
-// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] monotonic, align 4
-// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] monotonic, align 4
-// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] monotonic, align 4
-// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] monotonic, align 4
-// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi3a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_DEF-NEXT: ret void
+//
+// AMDGCN_CL_20-LABEL: define hidden void @fi3a(
+// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_20-NEXT: ret void
+//
// SPIRV-LABEL: define hidden spir_func void @fi3a(
-// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4
-// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] monotonic, align 4
-// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] monotonic, align 4
-// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] monotonic, align 4
-// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] monotonic, align 4
-// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] monotonic, align 4
-// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] monotonic, align 4
-// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] monotonic, align 4
+// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// SPIRV-NEXT: ret void
+//
void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
*a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
*b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
@@ -119,24 +778,357 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
*h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
}
-// AMDGCN-LABEL: define hidden void @fi3b(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi3b(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_DEF-NEXT: ret void
+//
+// AMDGCN_CL_20-LABEL: define hidden void @fi3b(
+// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_20-NEXT: ret void
+//
// SPIRV-LABEL: define hidden spir_func void @fi3b(
-// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("device") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// SPIRV-NEXT: ret void
+//
void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
*a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
*b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
@@ -148,24 +1140,357 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
*h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
}
-// AMDGCN-LABEL: define hidden void @fi3c(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi3c(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_DEF-NEXT: ret void
+//
+// AMDGCN_CL_20-LABEL: define hidden void @fi3c(
+// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_20-NEXT: ret void
+//
// SPIRV-LABEL: define hidden spir_func void @fi3c(
-// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("workgroup") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// SPIRV-NEXT: ret void
+//
void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
*a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
*b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
@@ -177,24 +1502,719 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
*h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
}
-// AMDGCN-LABEL: define hidden void @fi3d(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi3_clustr(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_DEF-NEXT: ret void
+//
+// AMDGCN_CL_20-LABEL: define hidden void @fi3_clustr(
+// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_20-NEXT: ret void
+//
+// SPIRV-LABEL: define hidden spir_func void @fi3_clustr(
+// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// SPIRV-NEXT: ret void
+//
+void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // Codegen test input: one relaxed, cluster-scope fetch-op per pointer, with each result stored back through that pointer.
+ *a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); // SPIRV checks above expect "atomicrmw add ... syncscope("workgroup") monotonic".
+ *b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
+ *c = __scoped_atomic_fetch_and(c, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
+ *d = __scoped_atomic_fetch_or(d, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
+ *e = __scoped_atomic_fetch_xor(e, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
+ *f = __scoped_atomic_fetch_nand(f, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); // Checked as "atomicrmw nand" in the SPIRV lines above.
+ *g = __scoped_atomic_fetch_min(g, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); // Checks expect "atomicrmw min" (not umin); AMDGCN_CL_20 pins syncscope("cluster") here.
+ *h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); // Checks expect "atomicrmw max" (not umax); AMDGCN_CL_20 pins syncscope("cluster") here.
+} // NOTE(review): the SPIRV checks pin syncscope("workgroup") for the cluster scope - confirm that this mapping is intended.
+
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi3d(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_DEF-NEXT: ret void
+//
+// AMDGCN_CL_20-LABEL: define hidden void @fi3d(
+// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_20-NEXT: ret void
+//
// SPIRV-LABEL: define hidden spir_func void @fi3d(
-// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("subgroup") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// SPIRV-NEXT: ret void
+//
void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
*a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
*b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
@@ -206,24 +2226,357 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
*h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
}
-// AMDGCN-LABEL: define hidden void @fi3e(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4
-// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("singlethread") monotonic, align 4
-// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("singlethread") monotonic, align 4
-// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("singlethread") monotonic, align 4
-// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("singlethread") monotonic, align 4
-// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("singlethread") monotonic, align 4
-// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("singlethread") monotonic, align 4
-// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi3e(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_DEF-NEXT: ret void
+//
+// AMDGCN_CL_20-LABEL: define hidden void @fi3e(
+// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_20-NEXT: ret void
+//
// SPIRV-LABEL: define hidden spir_func void @fi3e(
-// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("singlethread") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// SPIRV-NEXT: ret void
+//
void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
*a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
*b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
@@ -235,10 +2588,98 @@ void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
*h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi4a(
-// AMDGCN-DAG: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4a(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4a(
-// SPIRV-DAG: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
_Bool fi4a(int *i) {
int cmp = 0;
int desired = 1;
@@ -247,10 +2688,98 @@ _Bool fi4a(int *i) {
__MEMORY_SCOPE_SYSTEM);
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi4b(
-// AMDGCN-DAG: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("agent") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4b(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("agent") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4b(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("agent") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4b(
-// SPIRV-DAG: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("device") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("device") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
_Bool fi4b(int *i) {
int cmp = 0;
int desired = 1;
@@ -259,10 +2788,98 @@ _Bool fi4b(int *i) {
__MEMORY_SCOPE_DEVICE);
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi4c(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4c(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4c(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("workgroup") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4c(
-// SPIRV: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
_Bool fi4c(int *i) {
int cmp = 0;
int desired = 1;
@@ -271,10 +2888,198 @@ _Bool fi4c(int *i) {
__MEMORY_SCOPE_WRKGRP);
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi4d(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4_clustr(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4_clustr(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("cluster") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
+// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4_clustr(
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
+_Bool fi4_clustr(int *i) {
+ int cmp = 0;
+ int desired = 1;
+ return __scoped_atomic_compare_exchange(i, &cmp, &desired, 0,
+ __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
+ __MEMORY_SCOPE_CLUSTR);
+}
+
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4d(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4d(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4d(
-// SPIRV: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("subgroup") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("subgroup") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
_Bool fi4d(int *i) {
int cmp = 0;
int desired = 1;
@@ -283,10 +3088,98 @@ _Bool fi4d(int *i) {
__MEMORY_SCOPE_WVFRNT);
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi4e(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4e(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4e(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("singlethread") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4e(
-// SPIRV: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
_Bool fi4e(int *i) {
int cmp = 0;
int desired = 1;
@@ -295,10 +3188,98 @@ _Bool fi4e(int *i) {
__MEMORY_SCOPE_SINGLE);
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi5a(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5a(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5a(
-// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
_Bool fi5a(int *i) {
int cmp = 0;
return __scoped_atomic_compare_exchange_n(i, &cmp, 1, 1, __ATOMIC_ACQUIRE,
@@ -306,10 +3287,98 @@ _Bool fi5a(int *i) {
__MEMORY_SCOPE_SYSTEM);
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi5b(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("agent") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5b(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("agent") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5b(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("agent") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5b(
-// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("device") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("device") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
_Bool fi5b(int *i) {
int cmp = 0;
return __scoped_atomic_compare_exchange_n(i, &cmp, 1, 1, __ATOMIC_ACQUIRE,
@@ -317,127 +3386,1161 @@ _Bool fi5b(int *i) {
__MEMORY_SCOPE_DEVICE);
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi5c(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5c(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5c(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5c(
-// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
_Bool fi5c(int *i) {
int cmp = 0;
return __scoped_atomic_compare_exchange_n(
i, &cmp, 1, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_WRKGRP);
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi5d(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5_clustr(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5_clustr(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
+// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5_clustr(
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
+_Bool fi5_clustr(int *i) {
+ int cmp = 0;
+ return __scoped_atomic_compare_exchange_n(
+ i, &cmp, 1, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_CLUSTR);
+}
+
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5d(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5d(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5d(
-// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("subgroup") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("subgroup") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
_Bool fi5d(int *i) {
int cmp = 0;
return __scoped_atomic_compare_exchange_n(
i, &cmp, 1, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_WVFRNT);
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi5e(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5e(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5e(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5e(
-// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
_Bool fi5e(int *i) {
int cmp = 0;
return __scoped_atomic_compare_exchange_n(
i, &cmp, 1, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SINGLE);
}
-// AMDGCN-LABEL: define hidden i32 @fi6a(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi6a(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4
+// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]]
+//
// SPIRV-LABEL: define hidden spir_func i32 @fi6a(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4
+// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4
+// SPIRV-NEXT: ret i32 [[TMP4]]
+//
// Relaxed scoped atomic exchange at __MEMORY_SCOPE_SYSTEM. The checks above
// expect a monotonic atomicrmw xchg with no syncscope in every configuration.
int fi6a(int *c, int *d) {
int ret;
// Stores the value loaded from d into *c and writes the old *c into ret.
__scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
return ret;
}
-// AMDGCN-LABEL: define hidden i32 @fi6b(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6b(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi6b(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4
+// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]]
+//
// SPIRV-LABEL: define hidden spir_func i32 @fi6b(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("device") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4
+// SPIRV-NEXT: ret i32 [[TMP4]]
+//
// Relaxed scoped atomic exchange at __MEMORY_SCOPE_DEVICE. The checks above
// expect syncscope("agent") for the AMDGCN runs and syncscope("device") for
// the SPIR-V run.
int fi6b(int *c, int *d) {
int ret;
// Stores the value loaded from d into *c and writes the old *c into ret.
__scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
return ret;
}
-// AMDGCN-LABEL: define hidden i32 @fi6c(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6c(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi6c(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4
+// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]]
+//
// SPIRV-LABEL: define hidden spir_func i32 @fi6c(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4
+// SPIRV-NEXT: ret i32 [[TMP4]]
+//
// Relaxed scoped atomic exchange at __MEMORY_SCOPE_WRKGRP. The checks above
// expect syncscope("workgroup") for both the AMDGCN runs and the SPIR-V run.
int fi6c(int *c, int *d) {
int ret;
// Stores the value loaded from d into *c and writes the old *c into ret.
__scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
return ret;
}
-// AMDGCN-LABEL: define hidden i32 @fi6d(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6_clustr(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi6_clustr(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4
+// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]]
+//
+// SPIRV-LABEL: define hidden spir_func i32 @fi6_clustr(
+// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4
+// SPIRV-NEXT: ret i32 [[TMP4]]
+//
+int fi6_clustr(int *c, int *d) { // relaxed exchange at cluster scope
+ int ret; // receives the previous value of *c
+ __scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); // checks above expect syncscope("cluster") for the AMDGCN runs and syncscope("workgroup") for the SPIR-V run
+ return ret;
+}
+
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6d(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi6d(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4
+// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]]
+//
// SPIRV-LABEL: define hidden spir_func i32 @fi6d(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("subgroup") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4
+// SPIRV-NEXT: ret i32 [[TMP4]]
+//
// Relaxed scoped atomic exchange at __MEMORY_SCOPE_WVFRNT. The checks above
// expect syncscope("wavefront") for the AMDGCN runs and syncscope("subgroup")
// for the SPIR-V run.
int fi6d(int *c, int *d) {
int ret;
// Stores the value loaded from d into *c and writes the old *c into ret.
__scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
return ret;
}
-// AMDGCN-LABEL: define hidden i32 @fi6e(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6e(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi6e(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4
+// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]]
+//
// SPIRV-LABEL: define hidden spir_func i32 @fi6e(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4
+// SPIRV-NEXT: ret i32 [[TMP4]]
+//
// Relaxed scoped atomic exchange at __MEMORY_SCOPE_SINGLE. The checks above
// expect syncscope("singlethread") for both the AMDGCN runs and the SPIR-V run.
int fi6e(int *c, int *d) {
int ret;
// Stores the value loaded from d into *c and writes the old *c into ret.
__scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
return ret;
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi7a(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] monotonic, align 1
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7a(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7a(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] monotonic, align 1
+// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1
+// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
// Relaxed scoped exchange of a _Bool with the constant 1 at
// __MEMORY_SCOPE_SYSTEM. The checks above expect an i8 monotonic atomicrmw
// xchg with no syncscope, whose result is truncated to i1 and returned.
_Bool fi7a(_Bool *c) {
return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED,
__MEMORY_SCOPE_SYSTEM);
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi7b(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("agent") monotonic, align 1
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7b(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7b(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7b(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("device") monotonic, align 1
+// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("device") monotonic, align 1
+// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
_Bool fi7b(_Bool *c) {
return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED,
__MEMORY_SCOPE_DEVICE);
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi7c(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("workgroup") monotonic, align 1
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7c(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7c(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7c(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("workgroup") monotonic, align 1
+// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1
+// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
_Bool fi7c(_Bool *c) {
return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED,
__MEMORY_SCOPE_WRKGRP);
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi7d(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("wavefront") monotonic, align 1
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7_clustr(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7_clustr(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
+// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7_clustr(
+// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1
+// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
+_Bool fi7_clustr(_Bool *c) {
+ return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED,
+ __MEMORY_SCOPE_CLUSTR);
+}
+
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7d(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7d(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7d(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("subgroup") monotonic, align 1
+// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("subgroup") monotonic, align 1
+// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
_Bool fi7d(_Bool *c) {
return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED,
__MEMORY_SCOPE_WVFRNT);
}
-// AMDGCN-LABEL: define hidden zeroext i1 @fi7e(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("singlethread") monotonic, align 1
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7e(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7e(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7e(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("singlethread") monotonic, align 1
+// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1
+// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
_Bool fi7e(_Bool *c) {
return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED,
__MEMORY_SCOPE_SINGLE);
}
+//.
+// AMDGCN_CL_DEF: [[META3]] = !{}
+//.
+// AMDGCN_CL_20: [[META4]] = !{}
+//.
diff --git a/clang/test/CodeGen/scoped-fence-ops.c b/clang/test/CodeGen/scoped-fence-ops.c
index d83ae05b0aea2..259e8d333e4c8 100644
--- a/clang/test/CodeGen/scoped-fence-ops.c
+++ b/clang/test/CodeGen/scoped-fence-ops.c
@@ -1,8 +1,8 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
-// RUN: -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
+// RUN: -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN_CL_DEF %s
// RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
-// RUN: -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
+// RUN: -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN_CL_20 %s
// RUN: %clang_cc1 %s -emit-llvm -o - -triple=spirv64-unknown-unknown -ffreestanding \
// RUN: -fvisibility=hidden | FileCheck --check-prefix=SPIRV %s
// RUN: %clang_cc1 %s -emit-llvm -o - -triple=x86_64-unknown-linux-gnu -ffreestanding \
@@ -127,23 +127,27 @@ void fe1b(int ord) {
// AMDGCN-NEXT: store i32 [[SCOPE]], ptr [[SCOPE_ADDR_ASCAST]], align 4
// AMDGCN-NEXT: [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR_ASCAST]], align 4
// AMDGCN-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
-// AMDGCN-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]]
// AMDGCN-NEXT: i32 0, label %[[SYSTEM_SCOPE:.*]]
+// AMDGCN-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]]
// AMDGCN-NEXT: i32 2, label %[[WORKGROUP_SCOPE:.*]]
+// AMDGCN-NEXT: i32 5, label %[[CLUSTER_SCOPE:.*]]
// AMDGCN-NEXT: i32 3, label %[[WAVEFRONT_SCOPE:.*]]
// AMDGCN-NEXT: i32 4, label %[[SINGLE_SCOPE:.*]]
// AMDGCN-NEXT: ]
// AMDGCN: [[ATOMIC_SCOPE_CONTINUE]]:
// AMDGCN-NEXT: ret void
-// AMDGCN: [[DEVICE_SCOPE]]:
-// AMDGCN-NEXT: fence syncscope("agent") release
-// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
// AMDGCN: [[SYSTEM_SCOPE]]:
// AMDGCN-NEXT: fence release
// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN: [[DEVICE_SCOPE]]:
+// AMDGCN-NEXT: fence syncscope("agent") release
+// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
// AMDGCN: [[WORKGROUP_SCOPE]]:
// AMDGCN-NEXT: fence syncscope("workgroup") release
// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
+// AMDGCN: [[CLUSTER_SCOPE]]:
+// AMDGCN-NEXT: fence syncscope("cluster") release
+// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
// AMDGCN: [[WAVEFRONT_SCOPE]]:
// AMDGCN-NEXT: fence syncscope("wavefront") release
// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
@@ -158,23 +162,27 @@ void fe1b(int ord) {
// SPIRV-NEXT: store i32 [[SCOPE]], ptr [[SCOPE_ADDR]], align 4
// SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR]], align 4
// SPIRV-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
-// SPIRV-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]]
// SPIRV-NEXT: i32 0, label %[[SYSTEM_SCOPE:.*]]
+// SPIRV-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]]
// SPIRV-NEXT: i32 2, label %[[WORKGROUP_SCOPE:.*]]
+// SPIRV-NEXT: i32 5, label %[[CLUSTER_SCOPE:.*]]
// SPIRV-NEXT: i32 3, label %[[WAVEFRONT_SCOPE:.*]]
// SPIRV-NEXT: i32 4, label %[[SINGLE_SCOPE:.*]]
// SPIRV-NEXT: ]
// SPIRV: [[ATOMIC_SCOPE_CONTINUE]]:
// SPIRV-NEXT: ret void
-// SPIRV: [[DEVICE_SCOPE]]:
-// SPIRV-NEXT: fence syncscope("device") release
-// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
// SPIRV: [[SYSTEM_SCOPE]]:
// SPIRV-NEXT: fence release
// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
+// SPIRV: [[DEVICE_SCOPE]]:
+// SPIRV-NEXT: fence syncscope("device") release
+// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
// SPIRV: [[WORKGROUP_SCOPE]]:
// SPIRV-NEXT: fence syncscope("workgroup") release
// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
+// SPIRV: [[CLUSTER_SCOPE]]:
+// SPIRV-NEXT: fence syncscope("workgroup") release
+// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
// SPIRV: [[WAVEFRONT_SCOPE]]:
// SPIRV-NEXT: fence syncscope("subgroup") release
// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
@@ -189,23 +197,27 @@ void fe1b(int ord) {
// X86_64-NEXT: store i32 [[SCOPE]], ptr [[SCOPE_ADDR]], align 4
// X86_64-NEXT: [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR]], align 4
// X86_64-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [
-// X86_64-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]]
// X86_64-NEXT: i32 0, label %[[SYSTEM_SCOPE:.*]]
+// X86_64-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]]
// X86_64-NEXT: i32 2, label %[[WORKGROUP_SCOPE:.*]]
+// X86_64-NEXT: i32 5, label %[[CLUSTER_SCOPE:.*]]
// X86_64-NEXT: i32 3, label %[[WAVEFRONT_SCOPE:.*]]
// X86_64-NEXT: i32 4, label %[[SINGLE_SCOPE:.*]]
// X86_64-NEXT: ]
// X86_64: [[ATOMIC_SCOPE_CONTINUE]]:
// X86_64-NEXT: ret void
-// X86_64: [[DEVICE_SCOPE]]:
+// X86_64: [[SYSTEM_SCOPE]]:
// X86_64-NEXT: fence release
// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
-// X86_64: [[SYSTEM_SCOPE]]:
+// X86_64: [[DEVICE_SCOPE]]:
// X86_64-NEXT: fence release
// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
// X86_64: [[WORKGROUP_SCOPE]]:
// X86_64-NEXT: fence release
// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
+// X86_64: [[CLUSTER_SCOPE]]:
+// X86_64-NEXT: fence release
+// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
// X86_64: [[WAVEFRONT_SCOPE]]:
// X86_64-NEXT: fence release
// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]]
@@ -257,3 +269,6 @@ void fe2a() {
void fe2b() {
__scoped_atomic_thread_fence(__ATOMIC_RELEASE, 999);
}
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// AMDGCN_CL_20: {{.*}}
+// AMDGCN_CL_DEF: {{.*}}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 6bb20bff436fb..faf6a7d44fee2 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -5,6 +5,8 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,AMDGCNSPIRV %s
+#define INVALID_MEMORY_SCOPE (__MEMORY_SCOPE_CLUSTR+1)
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
typedef unsigned long ulong;
@@ -252,13 +254,19 @@ void test_update_dpp_const_int(global int* out, int arg1)
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
-// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
// AMDGCNSPIRV: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}}
-// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
-// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
+
+// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("cluster") monotonic, align 4{{$}}
+// AMDGCNSPIRV: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
+
+// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
// AMDGCNSPIRV: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}}
-// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
-// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
+// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
#if !defined(__SPIRV__)
void test_ds_faddf(local float *out, float src) {
#else
@@ -279,9 +287,10 @@ void test_ds_faddf(local float *out, float src) {
// Test all syncscopes.
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
+ *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR, false);
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
- *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
+ *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, INVALID_MEMORY_SCOPE, false); // invalid
}
// CHECK-LABEL: @test_ds_fmin
@@ -295,13 +304,19 @@ void test_ds_faddf(local float *out, float src) {
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
-// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
// AMDGCNSPIRV: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}}
-// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
-// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
+
+// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("cluster") monotonic, align 4{{$}}
+// AMDGCNSPIRV: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
+
+// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
// AMDGCNSPIRV: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}}
-// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
-// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
#if !defined(__SPIRV__)
void test_ds_fminf(local float *out, float src) {
@@ -322,9 +337,10 @@ void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) {
// Test all syncscopes.
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
+ *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR, false);
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
- *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
+ *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, INVALID_MEMORY_SCOPE, false); // invalid
}
// CHECK-LABEL: @test_ds_fmax
@@ -338,13 +354,19 @@ void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) {
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
-// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
// AMDGCNSPIRV: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}}
-// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
-// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
+
+// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("cluster") monotonic, align 4{{$}}
+// AMDGCNSPIRV: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
+
+// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
// AMDGCNSPIRV: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}}
-// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
-// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
#if !defined(__SPIRV__)
void test_ds_fmaxf(local float *out, float src) {
@@ -365,9 +387,10 @@ void test_ds_fmaxf(__attribute__((address_space(3))) float *out, float src) {
// Test all syncscopes.
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR, false);
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
- *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
+ *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, INVALID_MEMORY_SCOPE, false); // invalid
}
// CHECK-LABEL: @test_s_memtime
diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c
index 3036b496db25d..460778f39d003 100644
--- a/clang/test/Preprocessor/init-aarch64.c
+++ b/clang/test/Preprocessor/init-aarch64.c
@@ -234,6 +234,7 @@
// AARCH64-NEXT: #define __LONG_MAX__ 9223372036854775807L
// AARCH64-NEXT: #define __LONG_WIDTH__ 64
// AARCH64-NEXT: #define __LP64__ 1
+// AARCH64-NEXT: #define __MEMORY_SCOPE_CLUSTR 5
// AARCH64-NEXT: #define __MEMORY_SCOPE_DEVICE 1
// AARCH64-NEXT: #define __MEMORY_SCOPE_SINGLE 4
// AARCH64-NEXT: #define __MEMORY_SCOPE_SYSTEM 0
@@ -989,6 +990,7 @@
// ARM64EC-MSVC: #define __LONG_LONG_MAX__ 9223372036854775807LL
// ARM64EC-MSVC: #define __LONG_MAX__ 2147483647L
// ARM64EC-MSVC: #define __LONG_WIDTH__ 32
+// ARM64EC-MSVC: #define __MEMORY_SCOPE_CLUSTR 5
// ARM64EC-MSVC: #define __MEMORY_SCOPE_DEVICE 1
// ARM64EC-MSVC: #define __MEMORY_SCOPE_SINGLE 4
// ARM64EC-MSVC: #define __MEMORY_SCOPE_SYSTEM 0
diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c
index 71a266b8a9157..fd7ce2073a512 100644
--- a/clang/test/Preprocessor/init-loongarch.c
+++ b/clang/test/Preprocessor/init-loongarch.c
@@ -182,11 +182,12 @@
// LA32: #define __LONG_LONG_MAX__ 9223372036854775807LL
// LA32: #define __LONG_MAX__ 2147483647L
// LA32: #define __LONG_WIDTH__ 32
-// LA32: #define __MEMORY_SCOPE_DEVICE 1
-// LA32: #define __MEMORY_SCOPE_SINGLE 4
-// LA32: #define __MEMORY_SCOPE_SYSTEM 0
-// LA32: #define __MEMORY_SCOPE_WRKGRP 2
-// LA32: #define __MEMORY_SCOPE_WVFRNT 3
+// LA32: #define __MEMORY_SCOPE_CLUSTR 5
+// LA32: #define __MEMORY_SCOPE_DEVICE 1
+// LA32: #define __MEMORY_SCOPE_SINGLE 4
+// LA32: #define __MEMORY_SCOPE_SYSTEM 0
+// LA32: #define __MEMORY_SCOPE_WRKGRP 2
+// LA32: #define __MEMORY_SCOPE_WVFRNT 3
// LA32: #define __NO_INLINE__ 1
// LA32: #define __NO_MATH_ERRNO__ 1
// LA32: #define __OBJC_BOOL_IS_BOOL 0
@@ -514,11 +515,12 @@
// LA64: #define __LONG_MAX__ 9223372036854775807L
// LA64: #define __LONG_WIDTH__ 64
// LA64: #define __LP64__ 1
-// LA64: #define __MEMORY_SCOPE_DEVICE 1
-// LA64: #define __MEMORY_SCOPE_SINGLE 4
-// LA64: #define __MEMORY_SCOPE_SYSTEM 0
-// LA64: #define __MEMORY_SCOPE_WRKGRP 2
-// LA64: #define __MEMORY_SCOPE_WVFRNT 3
+// LA64: #define __MEMORY_SCOPE_CLUSTR 5
+// LA64: #define __MEMORY_SCOPE_DEVICE 1
+// LA64: #define __MEMORY_SCOPE_SINGLE 4
+// LA64: #define __MEMORY_SCOPE_SYSTEM 0
+// LA64: #define __MEMORY_SCOPE_WRKGRP 2
+// LA64: #define __MEMORY_SCOPE_WVFRNT 3
// LA64: #define __NO_INLINE__ 1
// LA64: #define __NO_MATH_ERRNO__ 1
// LA64: #define __OBJC_BOOL_IS_BOOL 0
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index 7e0df96141364..4dea1b583a089 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -1889,6 +1889,7 @@
// WEBASSEMBLY64-NEXT:#define __LONG_MAX__ 9223372036854775807L
// WEBASSEMBLY64-NEXT:#define __LONG_WIDTH__ 64
// WEBASSEMBLY64-NEXT:#define __LP64__ 1
+// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_CLUSTR 5
// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_DEVICE 1
// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_SINGLE 4
// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_SYSTEM 0
@@ -2216,6 +2217,7 @@
// AVR:#define __LDBL_MIN__ 1.17549435e-38L
// AVR:#define __LONG_LONG_MAX__ 9223372036854775807LL
// AVR:#define __LONG_MAX__ 2147483647L
+// AVR:#define __MEMORY_SCOPE_CLUSTR 5
// AVR:#define __MEMORY_SCOPE_DEVICE 1
// AVR:#define __MEMORY_SCOPE_SINGLE 4
// AVR:#define __MEMORY_SCOPE_SYSTEM 0
@@ -2521,6 +2523,7 @@
// RISCV32: #define __LITTLE_ENDIAN__ 1
// RISCV32: #define __LONG_LONG_MAX__ 9223372036854775807LL
// RISCV32: #define __LONG_MAX__ 2147483647L
+// RISCV32: #define __MEMORY_SCOPE_CLUSTR 5
// RISCV32: #define __MEMORY_SCOPE_DEVICE 1
// RISCV32: #define __MEMORY_SCOPE_SINGLE 4
// RISCV32: #define __MEMORY_SCOPE_SYSTEM 0
@@ -2745,6 +2748,7 @@
// RISCV64: #define __LONG_LONG_MAX__ 9223372036854775807LL
// RISCV64: #define __LONG_MAX__ 9223372036854775807L
// RISCV64: #define __LP64__ 1
+// RISCV64: #define __MEMORY_SCOPE_CLUSTR 5
// RISCV64: #define __MEMORY_SCOPE_DEVICE 1
// RISCV64: #define __MEMORY_SCOPE_SINGLE 4
// RISCV64: #define __MEMORY_SCOPE_SYSTEM 0
@@ -2937,11 +2941,11 @@
// XTENSA: #define __GXX_ABI_VERSION {{.*}}
// XTENSA: #define __ILP32__ 1
// XTENSA: #define __INT16_C(c) c
-// XTENSA: #define __INT16_C_SUFFIX__
+// XTENSA: #define __INT16_C_SUFFIX__
// XTENSA: #define __INT16_MAX__ 32767
// XTENSA: #define __INT16_TYPE__ short
// XTENSA: #define __INT32_C(c) c
-// XTENSA: #define __INT32_C_SUFFIX__
+// XTENSA: #define __INT32_C_SUFFIX__
// XTENSA: #define __INT32_MAX__ 2147483647
// XTENSA: #define __INT32_TYPE__ int
// XTENSA: #define __INT64_C(c) c##LL
@@ -2949,7 +2953,7 @@
// XTENSA: #define __INT64_MAX__ 9223372036854775807LL
// XTENSA: #define __INT64_TYPE__ long long int
// XTENSA: #define __INT8_C(c) c
-// XTENSA: #define __INT8_C_SUFFIX__
+// XTENSA: #define __INT8_C_SUFFIX__
// XTENSA: #define __INT8_MAX__ 127
// XTENSA: #define __INT8_TYPE__ signed char
// XTENSA: #define __INTMAX_C(c) c##LL
@@ -3008,6 +3012,7 @@
// XTENSA: #define __LONG_LONG_MAX__ 9223372036854775807LL
// XTENSA: #define __LONG_MAX__ 2147483647L
// XTENSA: #define __LONG_WIDTH__ 32
+// XTENSA: #define __MEMORY_SCOPE_CLUSTR 5
// XTENSA: #define __MEMORY_SCOPE_DEVICE 1
// XTENSA: #define __MEMORY_SCOPE_SINGLE 4
// XTENSA: #define __MEMORY_SCOPE_SYSTEM 0
@@ -3050,7 +3055,7 @@
// XTENSA: #define __STDC_VERSION__ 201710L
// XTENSA: #define __STDC__ 1
// XTENSA: #define __UINT16_C(c) c
-// XTENSA: #define __UINT16_C_SUFFIX__
+// XTENSA: #define __UINT16_C_SUFFIX__
// XTENSA: #define __UINT16_MAX__ 65535
// XTENSA: #define __UINT16_TYPE__ unsigned short
// XTENSA: #define __UINT32_C(c) c##U
@@ -3062,7 +3067,7 @@
// XTENSA: #define __UINT64_MAX__ 18446744073709551615ULL
// XTENSA: #define __UINT64_TYPE__ long long unsigned int
// XTENSA: #define __UINT8_C(c) c
-// XTENSA: #define __UINT8_C_SUFFIX__
+// XTENSA: #define __UINT8_C_SUFFIX__
// XTENSA: #define __UINT8_MAX__ 255
// XTENSA: #define __UINT8_TYPE__ unsigned char
// XTENSA: #define __UINTMAX_C(c) c##ULL
@@ -3089,7 +3094,7 @@
// XTENSA: #define __UINT_LEAST64_TYPE__ long long unsigned int
// XTENSA: #define __UINT_LEAST8_MAX__ 255
// XTENSA: #define __UINT_LEAST8_TYPE__ unsigned char
-// XTENSA: #define __USER_LABEL_PREFIX__
+// XTENSA: #define __USER_LABEL_PREFIX__
// XTENSA: #define __WCHAR_MAX__ 2147483647
// XTENSA: #define __WCHAR_TYPE__ int
// XTENSA: #define __WCHAR_WIDTH__ 32
diff --git a/clang/test/SemaCUDA/atomic-ops.cu b/clang/test/SemaCUDA/atomic-ops.cu
index 233ed1c10fc11..40e110c4b9b77 100644
--- a/clang/test/SemaCUDA/atomic-ops.cu
+++ b/clang/test/SemaCUDA/atomic-ops.cu
@@ -2,6 +2,8 @@
#include "Inputs/cuda.h"
+#define INVALID_HIP_MEMORY_SCOPE (__HIP_MEMORY_SCOPE_CLUSTER+1)
+
__device__ int test_hip_atomic_load(int *pi32, unsigned int *pu32, long long *pll, unsigned long long *pull, float *fp, double *dbl) {
int val = __hip_atomic_load(0); // expected-error {{too few arguments to function call, expected 3, have 1}}
val = __hip_atomic_load(0, 0, 0, 0); // expected-error {{too many arguments to function call, expected 3, have 4}}
@@ -10,9 +12,10 @@ __device__ int test_hip_atomic_load(int *pi32, unsigned int *pu32, long long *pl
val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+ val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER);
val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
- val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}}
+ val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, INVALID_HIP_MEMORY_SCOPE); // expected-error {{synchronization scope argument to atomic operation is invalid}}
val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
val = __hip_atomic_load(pi32, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD);
val = __hip_atomic_load(pi32, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD);
@@ -35,9 +38,10 @@ __device__ int test_hip_atomic_store(int *pi32, unsigned int *pu32, long long *p
__hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
__hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
__hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+ __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER);
__hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
__hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
- __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}}
+ __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, INVALID_HIP_MEMORY_SCOPE); // expected-error {{synchronization scope argument to atomic operation is invalid}}
__hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
__hip_atomic_store(pi32, 0, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD);
__hip_atomic_store(pi32, 0, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning{{memory order argument to atomic operation is invalid}}
@@ -71,6 +75,7 @@ __device__ bool test_hip_atomic_cmpxchg_weak(int *ptr, int val, int desired) {
flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_CONSUME, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+ flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER);
flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD);
diff --git a/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu b/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu
index ea1f24670ff9a..503e786877819 100644
--- a/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu
+++ b/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu
@@ -2,6 +2,8 @@
#include "Inputs/cuda.h"
+#define INVALID_HIP_MEMORY_SCOPE (__HIP_MEMORY_SCOPE_CLUSTER+1)
+
__device__ int test_hip_atomic_load(int *pi32, unsigned int *pu32, long long *pll, unsigned long long *pull, float *fp, double *dbl) {
int val = __hip_atomic_load(0); // expected-error {{too few arguments to function call, expected 3, have 1}}
val = __hip_atomic_load(0, 0, 0, 0); // expected-error {{too many arguments to function call, expected 3, have 4}}
@@ -10,9 +12,10 @@ __device__ int test_hip_atomic_load(int *pi32, unsigned int *pu32, long long *pl
val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+ val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER);
val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
- val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}}
+ val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, INVALID_HIP_MEMORY_SCOPE); // expected-error {{synchronization scope argument to atomic operation is invalid}}
val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
val = __hip_atomic_load(pi32, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD);
val = __hip_atomic_load(pi32, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD);
@@ -35,9 +38,10 @@ __device__ int test_hip_atomic_store(int *pi32, unsigned int *pu32, long long *p
__hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
__hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
__hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+ __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER);
__hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
__hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
- __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}}
+ __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, INVALID_HIP_MEMORY_SCOPE); // expected-error {{synchronization scope argument to atomic operation is invalid}}
__hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
__hip_atomic_store(pi32, 0, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD);
__hip_atomic_store(pi32, 0, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning{{memory order argument to atomic operation is invalid}}
@@ -71,6 +75,7 @@ __device__ bool test_hip_atomic_cmpxchg_weak(int *ptr, int val, int desired) {
flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_CONSUME, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+ flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER);
flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD);
>From f90b14c94b6e318d0e9a6f4e9805aef699e83a5f Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Mon, 20 Oct 2025 14:52:02 -0500
Subject: [PATCH 2/2] fixup! [clang] Add support for cluster sync scope
---
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 6f281c7a3f843..f49a5af2c9587 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -314,23 +314,23 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
}
// Older builtins had an enum argument for the memory scope.
- const char *ssn = nullptr;
+ const char *SSN = nullptr;
int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
switch (scope) {
case AtomicScopeGenericModel::System: // __MEMORY_SCOPE_SYSTEM
SSID = llvm::SyncScope::System;
break;
case AtomicScopeGenericModel::Device: // __MEMORY_SCOPE_DEVICE
- ssn = getTarget().getTriple().isSPIRV() ? "device" : "agent";
+ SSN = getTarget().getTriple().isSPIRV() ? "device" : "agent";
break;
case AtomicScopeGenericModel::Workgroup: // __MEMORY_SCOPE_WRKGRP
- ssn = "workgroup";
+ SSN = "workgroup";
break;
case AtomicScopeGenericModel::Cluster: // __MEMORY_SCOPE_CLUSTR
- ssn = getTarget().getTriple().isSPIRV() ? "workgroup" : "cluster";
+ SSN = getTarget().getTriple().isSPIRV() ? "workgroup" : "cluster";
break;
case AtomicScopeGenericModel::Wavefront: // __MEMORY_SCOPE_WVFRNT
- ssn = getTarget().getTriple().isSPIRV() ? "subgroup" : "wavefront";
+ SSN = getTarget().getTriple().isSPIRV() ? "subgroup" : "wavefront";
break;
case AtomicScopeGenericModel::Single: // __MEMORY_SCOPE_SINGLE
SSID = llvm::SyncScope::SingleThread;
@@ -339,8 +339,8 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
SSID = llvm::SyncScope::System;
break;
}
- if (ssn)
- SSID = getLLVMContext().getOrInsertSyncScopeID(ssn);
+ if (SSN)
+ SSID = getLLVMContext().getOrInsertSyncScopeID(SSN);
}
llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
More information about the cfe-commits
mailing list