[llvm] [AMDGPU] Remove `nosync` from image atomic intrinsics. (PR #76814)

Wed Jan 3 23:15:51 PST 2024

https://github.com/sstipanovic updated https://github.com/llvm/llvm-project/pull/76814

>From 82e782ffaf196a54a4d39aed80ceed5d21009350 Mon Sep 17 00:00:00 2001
From: Stefan Stipanovic <Stefan.Stipanovic at amd.com>
Date: Wed, 3 Jan 2024 14:08:21 +0100
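Subject: [PATCH] [AMDGPU] Remove `nosync` from image atomic intrinsics.

AMDGPUImageDimIntrinsic no longer derives from DefaultAttrsIntrinsic.
It now derives from Intrinsic and lists IntrNoCallback, IntrNoFree and
IntrWillReturn explicitly; IntrNoSync is appended only when the
dimension profile is not atomic (P_.IsAtomic is false).

The new test image-atomic-attributes.ll checks that the attribute set
attached to the image atomic intrinsic declarations is
`nocallback nofree nounwind willreturn`, i.e. without `nosync`.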
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   7 +-
 .../CodeGen/AMDGPU/image-atomic-attributes.ll | 356 ++++++++++++++++++
 2 files changed, 360 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/image-atomic-attributes.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 531b1112354526..e5596258847f9f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -837,7 +837,7 @@ class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
 // All dimension-aware intrinsics are derived from this class.
 class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
                               list<IntrinsicProperty> props,
-                              list<SDNodeProperty> sdnodeprops> : DefaultAttrsIntrinsic<
+                              list<SDNodeProperty> sdnodeprops> : Intrinsic<
     P_.RetTypes,        // vdata(VGPR) -- for load/atomic-with-return
     !listconcat(
       !foreach(arg, P_.DataArgs, arg.Type),      // vdata(VGPR) -- for store/atomic
@@ -851,11 +851,12 @@ class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
                                                  //   gfx12+ imm: bits [0-2] = th, bits [3-4] = scope)
                                                  // TODO-GFX12: Update all other cachepolicy descriptions.
 
-     !listconcat(props,
+     !listconcat(props, [IntrNoCallback, IntrNoFree, IntrWillReturn],
           !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
           !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
           [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
-           ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>]),
+           ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>],
+          !if(P_.IsAtomic, [], [IntrNoSync])),
 
 
       "", sdnodeprops>,
diff --git a/llvm/test/CodeGen/AMDGPU/image-atomic-attributes.ll b/llvm/test/CodeGen/AMDGPU/image-atomic-attributes.ll
new file mode 100644
index 00000000000000..dd60d9d702716d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/image-atomic-attributes.ll
@@ -0,0 +1,356 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 4
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefixes=CHECK %s
+
+define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_swap_1d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps <2 x float> @atomic_swap_1d_i64(<8 x i32> inreg %rsrc, i64 %data, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps <2 x float> @atomic_swap_1d_i64(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i64 [[DATA:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i64 [[V]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[OUT]]
+;
+main_body:
+  %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i64 %v to <2 x float>
+  ret <2 x float> %out
+}
+
+define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_add_1d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_sub_1d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_smin_1d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_umin_1d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_smax_1d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_umax_1d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_and_1d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_or_1d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_xor_1d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_inc_1d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_dec_1d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_cmpswap_1d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[CMP:%.*]], i32 [[SWAP:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 [[CMP]], i32 [[SWAP]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps <2 x float> @atomic_cmpswap_1d_64(<8 x i32> inreg %rsrc, i64 %cmp, i64 %swap, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps <2 x float> @atomic_cmpswap_1d_64(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i64 [[CMP:%.*]], i64 [[SWAP:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 [[CMP]], i64 [[SWAP]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i64 [[V]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[OUT]]
+;
+main_body:
+  %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i64 %v to <2 x float>
+  ret <2 x float> %out
+}
+
+define amdgpu_ps float @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_add_2d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %r) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_add_3d(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[R:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[R]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %face) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_add_cube(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[FACE:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[FACE]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %slice) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_add_1darray(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[SLICE:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[SLICE]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_add_2darray(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[SLICE]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %fragid) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_add_2dmsaa(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[FRAGID:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[FRAGID]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_add_2darraymsaa(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]], i32 [[FRAGID:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 [[DATA]], i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 [[FRAGID]], <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+define amdgpu_ps float @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; CHECK-LABEL: define amdgpu_ps float @atomic_add_1d_slc(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[DATA:%.*]], i32 [[S:%.*]]) {
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 [[DATA]], i32 [[S]], <8 x i32> [[RSRC]], i32 0, i32 2)
+; CHECK-NEXT:    [[OUT:%.*]] = bitcast i32 [[V]] to float
+; CHECK-NEXT:    ret float [[OUT]]
+;
+main_body:
+  %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
+  %out = bitcast i32 %v to float
+  ret float %out
+}
+
+declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0
+
+declare i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64, i32, <8 x i32>, i32, i32) #0
+declare i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64, i64, i32, <8 x i32>, i32, i32) #0
+
+declare i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn }
+;.