[llvm] 87b6f85 - AMDGPU: Add syncscopes to some atomic tests

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 8 11:38:27 PDT 2023


Author: Matt Arsenault
Date: 2023-08-08T14:38:06-04:00
New Revision: 87b6f85c2be4dea52b01066b8f5cd55f97864773

URL: https://github.com/llvm/llvm-project/commit/87b6f85c2be4dea52b01066b8f5cd55f97864773
DIFF: https://github.com/llvm/llvm-project/commit/87b6f85c2be4dea52b01066b8f5cd55f97864773.diff

LOG: AMDGPU: Add syncscopes to some atomic tests

These tests were not exercising what was intended, namely the cases we
can directly select to the instructions.
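
For context, a minimal IR sketch of the change (the @example function is
illustrative and not part of the diff): an atomicrmw with no syncscope
defaults to the system scope, while syncscope("agent") limits ordering to
the device, which is the case these tests intend to select directly to
the atomic instructions:

  define i32 @example(ptr addrspace(1) %p) {
    ; Default (system) scope: may require extra cache maintenance,
    ; so it is not always selected to the plain atomic instruction.
    %old.sys = atomicrmw udec_wrap ptr addrspace(1) %p, i32 42 seq_cst, align 4
    ; Agent (device) scope: the case these tests now exercise.
    %old.agent = atomicrmw udec_wrap ptr addrspace(1) %p, i32 42 syncscope("agent") seq_cst, align 4
    ret i32 %old.agent
  }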

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
    llvm/test/CodeGen/AMDGPU/acc-ldst.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
    llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
    llvm/test/CodeGen/AMDGPU/flat_atomics.ll
    llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
    llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
    llvm/test/CodeGen/AMDGPU/global_atomics.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
    llvm/test/CodeGen/AMDGPU/mubuf.ll
    llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
    llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index cb9e35556ff7c6..feb65a5210d59d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -93,7 +93,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -179,7 +179,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -244,7 +244,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
   ret void
 }
 
@@ -309,7 +309,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4
   ret void
 }
 
@@ -386,7 +386,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -467,6 +467,89 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_ret_i32_offset_system:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s2, s2, 16
+; CI-NEXT:    s_addc_u32 s3, s3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: global_atomic_dec_ret_i32_offset_system:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s2, s2, 16
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_ret_i32_offset_system:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_ret_i32_offset_system:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_ret_i32_offset_system:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
@@ -536,7 +619,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
   ret void
 }
 
@@ -605,6 +688,77 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    s_endpgm
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_noret_i32_offset_system:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s0, s0, 16
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_dec v[0:1], v2
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: global_atomic_dec_noret_i32_offset_system:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s0, 16
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_dec v[0:1], v2
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_noret_i32_offset_system:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_atomic_dec v1, v0, s[0:1] offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_noret_i32_offset_system:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_dec v1, v0, s[0:1] offset:16
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_noret_i32_offset_system:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_dec_u32 v1, v0, s[0:1] offset:16
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
@@ -702,7 +856,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
   %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id
   %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr addrspace(1) %out.gep, align 4
   ret void
 }
@@ -782,7 +936,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
   %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
   ret void
 }
 
@@ -868,7 +1022,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i32 42 seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr %out, align 4
   ret void
 }
@@ -960,6 +1114,100 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-NEXT:    s_endpgm
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+  store i32 %result, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_ret_i32_offset_system:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s2, s2, 16
+; CI-NEXT:    s_addc_u32 s3, s3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_ret_i32_offset_system:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s2, s2, 16
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 42
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_atomic_dec v2, v[0:1], v2 offset:16 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    flat_store_dword v[0:1], v2
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 42
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_add_u32 s2, s2, 16
+; GFX10-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_dec v2, v[0:1], v2 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_system:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 42
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr %ptr, i32 4
   %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4
@@ -994,7 +1242,85 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: flat_atomic_dec_noret_i32:
+; GFX9-LABEL: flat_atomic_dec_noret_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 42
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_atomic_dec v[0:1], v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_noret_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 42
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_dec v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_noret_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 42
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_dec_u32 v[0:1], v2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    s_endpgm
+  %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_noret_i32_offset:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s0, s0, 16
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_dec v[0:1], v2
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_noret_i32_offset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s0, 16
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_dec v[0:1], v2
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_noret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
@@ -1002,16 +1328,18 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    flat_atomic_dec v[0:1], v2
+; GFX9-NEXT:    flat_atomic_dec v[0:1], v2 offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: flat_atomic_dec_noret_i32:
+; GFX10-LABEL: flat_atomic_dec_noret_i32_offset:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_add_u32 s0, s0, 16
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1023,7 +1351,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: flat_atomic_dec_noret_i32:
+; GFX11-LABEL: flat_atomic_dec_noret_i32_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 42
@@ -1031,18 +1359,19 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_dec_u32 v[0:1], v2
+; GFX11-NEXT:    flat_atomic_dec_u32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i32 42 seq_cst, align 4
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
-; CI-LABEL: flat_atomic_dec_noret_i32_offset:
+define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
@@ -1057,7 +1386,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; CI-NEXT:    buffer_wbinvl1_vol
 ; CI-NEXT:    s_endpgm
 ;
-; VI-LABEL: flat_atomic_dec_noret_i32_offset:
+; VI-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
@@ -1072,7 +1401,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: flat_atomic_dec_noret_i32_offset:
+; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
@@ -1085,7 +1414,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: flat_atomic_dec_noret_i32_offset:
+; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
@@ -1103,7 +1432,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: flat_atomic_dec_noret_i32_offset:
+; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 42
@@ -1240,7 +1569,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
   %gep.tid = getelementptr i32, ptr %ptr, i32 %id
   %out.gep = getelementptr i32, ptr %out, i32 %id
   %gep = getelementptr i32, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr %out.gep, align 4
   ret void
 }
@@ -1340,7 +1669,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, ptr %ptr, i32 %id
   %gep = getelementptr i32, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
   ret void
 }
 
@@ -1441,7 +1770,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr %out, align 4
   ret void
 }
@@ -1550,7 +1879,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr %out, align 4
   ret void
 }
@@ -1630,7 +1959,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
   ret void
 }
 
@@ -1714,6 +2043,92 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    s_endpgm
+  %gep = getelementptr i64, ptr %ptr, i32 4
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_noret_i64_offset_system:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s0, s0, 32
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_noret_i64_offset_system:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s0, 32
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_add_u32 s0, s0, 32
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_system:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_dec_u64 v[2:3], v[0:1] offset:32
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
   %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
@@ -1850,7 +2265,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %out.gep = getelementptr i64, ptr %out, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr %out.gep, align 4
   ret void
 }
@@ -1955,7 +2370,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
   ret void
 }
 
@@ -2046,7 +2461,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
-  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i32 9 seq_cst, align 4
+  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i32 9 syncscope("agent") seq_cst, align 4
   store i32 %idx.0, ptr addrspace(1) %add_use, align 4
   store i32 %result, ptr addrspace(1) %out, align 4
   ret void
@@ -2137,7 +2552,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -2228,7 +2643,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -2298,7 +2713,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
   ret void
 }
 
@@ -2368,7 +2783,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8
   ret void
 }
 
@@ -2450,7 +2865,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -2536,6 +2951,94 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+  store i64 %result, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_ret_i64_offset_system:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s2, s2, 32
+; CI-NEXT:    s_addc_u32 s3, s3, 0
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: global_atomic_dec_ret_i64_offset_system:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s2, s2, 32
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_ret_i64_offset_system:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_ret_i64_offset_system:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_ret_i64_offset_system:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
@@ -2610,7 +3113,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
   ret void
 }
 
@@ -2684,6 +3187,82 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    s_endpgm
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_noret_i64_offset_system:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s0, s0, 32
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: global_atomic_dec_noret_i64_offset_system:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s0, 32
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_noret_i64_offset_system:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_noret_i64_offset_system:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_noret_i64_offset_system:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
@@ -2786,7 +3365,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
   %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
   %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id
   %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr addrspace(1) %out.gep, align 4
   ret void
 }
@@ -2871,7 +3450,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
   %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
   ret void
 }
 
@@ -2967,7 +3546,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
-  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8
   store i32 %idx.0, ptr addrspace(1) %add_use, align 4
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index cabb67d073702a..009a66721de159 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -93,7 +93,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4
+  %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -179,7 +179,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
-  %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 seq_cst, align 4
+  %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -244,7 +244,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4
+  %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
   ret void
 }
 
@@ -309,7 +309,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
-  %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 seq_cst, align 4
+  %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4
   ret void
 }
 
@@ -386,7 +386,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 seq_cst, align 4
+  %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -467,6 +467,89 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
+  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_inc_ret_i32_offset_system:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s2, s2, 16
+; CI-NEXT:    s_addc_u32 s3, s3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: global_atomic_inc_ret_i32_offset_system:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s2, s2, 16
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_inc_ret_i32_offset_system:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_atomic_inc_ret_i32_offset_system:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: global_atomic_inc_ret_i32_offset_system:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
@@ -536,7 +619,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 seq_cst, align 4
+  %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
   ret void
 }
 
@@ -605,6 +688,77 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    s_endpgm
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
+  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_inc_noret_i32_offset_system:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s0, s0, 16
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_inc v[0:1], v2
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: global_atomic_inc_noret_i32_offset_system:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s0, 16
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_inc v[0:1], v2
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_inc_noret_i32_offset_system:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_atomic_inc v1, v0, s[0:1] offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_atomic_inc_noret_i32_offset_system:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_inc v1, v0, s[0:1] offset:16
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: global_atomic_inc_noret_i32_offset_system:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_inc_u32 v1, v0, s[0:1] offset:16
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
@@ -702,7 +856,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
   %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id
   %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
-  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr addrspace(1) %out.gep, align 4
   ret void
 }
@@ -782,7 +936,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
   %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
-  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
   ret void
 }
 
@@ -873,7 +1027,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
-  %result = atomicrmw uinc_wrap ptr addrspace(3) %arrayidx0, i32 9 seq_cst, align 4
+  %result = atomicrmw uinc_wrap ptr addrspace(3) %arrayidx0, i32 9 syncscope("agent") seq_cst, align 4
   store i32 %idx.0, ptr addrspace(1) %add_use, align 4
   store i32 %result, ptr addrspace(1) %out, align 4
   ret void
@@ -964,7 +1118,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 seq_cst, align 8
+  %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -1055,7 +1209,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
-  %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -1125,7 +1279,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 seq_cst, align 8
+  %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
   ret void
 }
 
@@ -1195,7 +1349,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
-  %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8
   ret void
 }
 
@@ -1277,7 +1431,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 seq_cst, align 8
+  %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
 }
@@ -1363,6 +1517,94 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
+  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+  store i64 %result, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_inc_ret_i64_offset_system:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s2, s2, 32
+; CI-NEXT:    s_addc_u32 s3, s3, 0
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: global_atomic_inc_ret_i64_offset_system:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s2, s2, 32
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_inc_ret_i64_offset_system:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_atomic_inc_ret_i64_offset_system:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: global_atomic_inc_ret_i64_offset_system:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
@@ -1437,7 +1679,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 seq_cst, align 8
+  %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
   ret void
 }
 
@@ -1511,6 +1753,82 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    s_endpgm
+  %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
+  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_inc_noret_i64_offset_system:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s0, s0, 32
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: global_atomic_inc_noret_i64_offset_system:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s0, 32
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_inc_noret_i64_offset_system:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_atomic_inc_noret_i64_offset_system:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: global_atomic_inc_noret_i64_offset_system:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
   %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
@@ -1613,7 +1931,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
   %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
   %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id
   %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
-  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr addrspace(1) %out.gep, align 4
   ret void
 }
@@ -1698,7 +2016,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
   %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
-  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
   ret void
 }
 
@@ -1784,7 +2102,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw uinc_wrap ptr %ptr, i32 42 seq_cst, align 4
+  %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr %out, align 4
   ret void
 }
@@ -1834,61 +2152,231 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    flat_atomic_inc v2, v[0:1], v2 offset:16 glc
+; GFX9-NEXT:    flat_atomic_inc v2, v[0:1], v2 offset:16 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    flat_store_dword v[0:1], v2
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: flat_atomic_inc_ret_i32_offset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 42
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_add_u32 s2, s2, 16
+; GFX10-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: flat_atomic_inc_ret_i32_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 42
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-NEXT:    s_endpgm
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+  store i32 %result, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_inc_ret_i32_offset_system:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s2, s2, 16
+; CI-NEXT:    s_addc_u32 s3, s3, 0
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    flat_store_dword v[0:1], v2
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_ret_i32_offset_system:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s2, s2, 16
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 42
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_atomic_inc v2, v[0:1], v2 offset:16 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    flat_store_dword v[0:1], v2
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 42
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_add_u32 s2, s2, 16
+; GFX10-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    flat_store_dword v[0:1], v2
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_system:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 42
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    flat_store_b32 v[0:1], v2
+; GFX11-NEXT:    s_endpgm
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4
+  store i32 %result, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_inc_noret_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v2, 42
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_inc v[0:1], v2
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_noret_i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v2, 42
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_inc v[0:1], v2
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_noret_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 42
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_atomic_inc v[0:1], v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: flat_atomic_inc_ret_i32_offset:
+; GFX10-LABEL: flat_atomic_inc_noret_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_add_u32 s2, s2, 16
-; GFX10-NEXT:    s_addc_u32 s3, s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_inc v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    buffer_gl1_inv
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: flat_atomic_inc_ret_i32_offset:
+; GFX11-LABEL: flat_atomic_inc_noret_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_atomic_inc_u32 v[0:1], v2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
-  %gep = getelementptr i32, ptr %ptr, i32 4
-  %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4
-  store i32 %result, ptr %out, align 4
+  %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
-; CI-LABEL: flat_atomic_inc_noret_i32:
+define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_inc_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s0, s0, 16
+; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1897,11 +2385,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; CI-NEXT:    buffer_wbinvl1_vol
 ; CI-NEXT:    s_endpgm
 ;
-; VI-LABEL: flat_atomic_inc_noret_i32:
+; VI-LABEL: flat_atomic_inc_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s0, 16
+; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1910,7 +2400,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: flat_atomic_inc_noret_i32:
+; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
@@ -1918,16 +2408,18 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    flat_atomic_inc v[0:1], v2
+; GFX9-NEXT:    flat_atomic_inc v[0:1], v2 offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: flat_atomic_inc_noret_i32:
+; GFX10-LABEL: flat_atomic_inc_noret_i32_offset:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_add_u32 s0, s0, 16
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1939,7 +2431,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: flat_atomic_inc_noret_i32:
+; GFX11-LABEL: flat_atomic_inc_noret_i32_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 42
@@ -1947,18 +2439,19 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_inc_u32 v[0:1], v2
+; GFX11-NEXT:    flat_atomic_inc_u32 v[0:1], v2 offset:16
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw uinc_wrap ptr %ptr, i32 42 seq_cst, align 4
+  %gep = getelementptr i32, ptr %ptr, i32 4
+  %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
   ret void
 }
 
-define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
-; CI-LABEL: flat_atomic_inc_noret_i32_offset:
+define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
@@ -1973,7 +2466,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; CI-NEXT:    buffer_wbinvl1_vol
 ; CI-NEXT:    s_endpgm
 ;
-; VI-LABEL: flat_atomic_inc_noret_i32_offset:
+; VI-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
@@ -1988,7 +2481,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
+; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
@@ -2001,7 +2494,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: flat_atomic_inc_noret_i32_offset:
+; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
@@ -2019,7 +2512,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: flat_atomic_inc_noret_i32_offset:
+; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 42
@@ -2156,7 +2649,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
   %gep.tid = getelementptr i32, ptr %ptr, i32 %id
   %out.gep = getelementptr i32, ptr %out, i32 %id
   %gep = getelementptr i32, ptr %gep.tid, i32 5
-  %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4
+  %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result, ptr %out.gep, align 4
   ret void
 }
@@ -2256,7 +2749,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, ptr %ptr, i32 %id
   %gep = getelementptr i32, ptr %gep.tid, i32 5
-  %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4
+  %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
   ret void
 }
 
@@ -2352,7 +2845,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
-  %result = atomicrmw uinc_wrap ptr addrspace(3) %arrayidx0, i64 9 seq_cst, align 8
+  %result = atomicrmw uinc_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8
   store i32 %idx.0, ptr addrspace(1) %add_use, align 4
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
@@ -2455,7 +2948,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw uinc_wrap ptr %ptr, i64 42 seq_cst, align 8
+  %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr %out, align 4
   ret void
 }
@@ -2562,6 +3055,115 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX11-NEXT:    s_endpgm
+  %gep = getelementptr i64, ptr %ptr, i32 4
+  %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  store i64 %result, ptr %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_inc_ret_i64_offset_system:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s2, s2, 32
+; CI-NEXT:    s_addc_u32 s3, s3, 0
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    s_add_u32 s0, s0, 4
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_mov_b32_e32 v5, s1
+; CI-NEXT:    v_mov_b32_e32 v4, s0
+; CI-NEXT:    flat_store_dword v[2:3], v0
+; CI-NEXT:    flat_store_dword v[4:5], v1
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_ret_i64_offset_system:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s2, s2, 32
+; VI-NEXT:    s_addc_u32 s3, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_add_u32 s0, s0, 4
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    flat_store_dword v[2:3], v0
+; VI-NEXT:    flat_store_dword v[4:5], v1
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_add_u32 s2, s2, 32
+; GFX10-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_system:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
   %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8
@@ -2644,7 +3246,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw uinc_wrap ptr %ptr, i64 42 seq_cst, align 8
+  %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
   ret void
 }
 
@@ -2728,6 +3330,92 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    s_endpgm
+  %gep = getelementptr i64, ptr %ptr, i32 4
+  %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_inc_noret_i64_offset_system:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_u32 s0, s0, 32
+; CI-NEXT:    s_addc_u32 s1, s1, 0
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
+; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_wbinvl1_vol
+; CI-NEXT:    s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_noret_i64_offset_system:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s0, 32
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 42
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_add_u32 s0, s0, 32
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_system:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
   %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8
@@ -2864,7 +3552,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %out.gep = getelementptr i64, ptr %out, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
   store i64 %result, ptr %out.gep, align 4
   ret void
 }
@@ -2969,7 +3657,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
   ret void
 }
 
@@ -3078,8 +3766,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %result0 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4
-  %result1 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4
+  %result0 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+  %result1 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
   store i32 %result0, ptr addrspace(1) %out0, align 4
   store i32 %result1, ptr addrspace(1) %out1, align 4
   ret void

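For readers skimming the hunks above: the mechanical change is that each test now pins its atomic to the agent scope, while the newly added _system variants keep LLVM's default (system) scope so both forms stay covered. A minimal IR sketch of the two forms (the pointer name %p is illustrative, not taken from the patch):

  ; Default scope: the operation is ordered against every agent in the system.
  %r0 = atomicrmw uinc_wrap ptr addrspace(1) %p, i32 42 seq_cst, align 4

  ; Agent scope: the operation is only ordered within the executing GPU agent.
  %r1 = atomicrmw uinc_wrap ptr addrspace(1) %p, i32 42 syncscope("agent") seq_cst, align 4
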
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index 17a0e93dd4a221..c0a9cdf3a9344a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -927,7 +927,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
-  %result = atomicrmw add ptr addrspace(1) %gep, i32 2 seq_cst
+  %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
   %cast = bitcast i32 %result to float
   ret float %cast
 }
@@ -968,7 +968,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
-  %result = atomicrmw add ptr addrspace(1) %gep, i32 2 seq_cst
+  %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
   %cast = bitcast i32 %result to float
   ret float %cast
 }
@@ -1003,7 +1003,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(ptr addrspace(1) %pt
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
-  %result = atomicrmw add ptr addrspace(1) %gep, i32 2 seq_cst
+  %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
   %cast = bitcast i32 %result to float
   ret float %cast
 }
@@ -1038,7 +1038,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
-  %result = atomicrmw add ptr addrspace(1) %gep, i32 2 seq_cst
+  %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
   %cast = bitcast i32 %result to float
   ret float %cast
 }
@@ -1077,7 +1077,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
-  %result = atomicrmw add ptr addrspace(1) %gep, i32 2 seq_cst
+  %result = atomicrmw add ptr addrspace(1) %gep, i32 2 syncscope("agent") seq_cst
   %cast = bitcast i32 %result to float
   ret float %cast
 }
@@ -1114,7 +1114,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX7-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
-  %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
+  %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %result = extractvalue { i32, i1 } %result.struct, 0
   %cast = bitcast i32 %result to float
   ret float %cast
@@ -1158,7 +1158,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX7-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
-  %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
+  %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %result = extractvalue { i32, i1 } %result.struct, 0
   %cast = bitcast i32 %result to float
   ret float %cast
@@ -1194,7 +1194,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(ptr addrspace(1) %ptr,
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
-  %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
+  %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %result = extractvalue { i32, i1 } %result.struct, 0
   %cast = bitcast i32 %result to float
   ret float %cast
@@ -1230,7 +1230,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
-  %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
+  %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %result = extractvalue { i32, i1 } %result.struct, 0
   %cast = bitcast i32 %result to float
   ret float %cast
@@ -1270,7 +1270,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    ; return to shader part epilog
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
-  %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
+  %result.struct = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %result = extractvalue { i32, i1 } %result.struct, 0
   %cast = bitcast i32 %result to float
   ret float %cast

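The cmpxchg updates in this file follow the same shape; note that in the IR grammar the scope annotation sits before the two trailing orderings, which are the success and failure orderings in that order. A short sketch (operand names illustrative):

  %pair = cmpxchg ptr addrspace(1) %p, i32 %old, i32 %new syncscope("agent") seq_cst seq_cst
  %val = extractvalue { i32, i1 } %pair, 0   ; loaded value
  %ok = extractvalue { i32, i1 } %pair, 1    ; success flag
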
diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
index 9e4569afd6f2e1..1d829c056bd976 100644
--- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -202,14 +202,14 @@ define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic_store(ptr addrspace(1)
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tid
-  %in.1 = atomicrmw volatile sub ptr addrspace(1) %gep, i32 1 seq_cst
+  %in.1 = atomicrmw volatile sub ptr addrspace(1) %gep, i32 1 syncscope("agent") seq_cst
   %tmp0 = insertelement <4 x i32> undef, i32 %in.1, i32 0
   %tmp1 = insertelement <4 x i32> %tmp0, i32 0, i32 1
   %tmp2 = insertelement <4 x i32> %tmp1, i32 0, i32 2
   %tmp3 = insertelement <4 x i32> %tmp2, i32 0, i32 3
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %tmp3, i32 0, i32 0, i32 0)
   %elt = extractelement <4 x i32> %mai.1, i32 0
-  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %elt seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %elt syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %arg
   ret void
 }
@@ -225,7 +225,7 @@ define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic64_store(ptr addrspace(1
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tid
-  %in.1 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 1 seq_cst
+  %in.1 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 1 syncscope("agent") seq_cst
   %tmp0 = insertelement <2 x i64> undef, i64 %in.1, i32 0
   %tmp1 = insertelement <2 x i64> %tmp0, i64 0, i32 1
   %tmp2 = bitcast <2 x i64> %tmp0 to <4 x i32>
@@ -235,7 +235,7 @@ bb:
   %v2.1 = insertelement <2 x i32> undef, i32 %elt.1, i32 0
   %v2.2 = insertelement <2 x i32> %v2.1, i32 %elt.2, i32 1
   %v2 = bitcast <2 x i32> %v2.2 to i64
-  %val = atomicrmw volatile add ptr addrspace(1) %gep, i64 %v2 seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i64 %v2 syncscope("agent") seq_cst
   store i64 %val, ptr addrspace(1) %arg
   ret void
 }

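The atomics in acc-ldst.ll are additionally volatile; in the IR grammar the volatile keyword comes between atomicrmw and the operation, while the scope and ordering follow the value operand. A one-line sketch (pointer name illustrative):

  %old = atomicrmw volatile sub ptr addrspace(1) %p, i32 1 syncscope("agent") seq_cst
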
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index e54d6e0d470d88..81fd166e3779f8 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -226,7 +226,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw add ptr addrspace(1) %inout, i32 5 acq_rel
+  %old = atomicrmw add ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel
   store i32 %old, ptr addrspace(1) %out
   ret void
 }
@@ -493,7 +493,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw add ptr addrspace(1) %inout, i32 %additive acq_rel
+  %old = atomicrmw add ptr addrspace(1) %inout, i32 %additive syncscope("agent") acq_rel
   store i32 %old, ptr addrspace(1) %out
   ret void
 }
@@ -823,7 +823,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane acq_rel
+  %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel
   store i32 %old, ptr addrspace(1) %out
   ret void
 }
@@ -1062,7 +1062,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw add ptr addrspace(1) %inout, i64 5 acq_rel
+  %old = atomicrmw add ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel
   store i64 %old, ptr addrspace(1) %out
   ret void
 }
@@ -1381,7 +1381,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw add ptr addrspace(1) %inout, i64 %additive acq_rel
+  %old = atomicrmw add ptr addrspace(1) %inout, i64 %additive syncscope("agent") acq_rel
   store i64 %old, ptr addrspace(1) %out
   ret void
 }
@@ -1475,7 +1475,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %zext = zext i32 %lane to i64
-  %old = atomicrmw add ptr addrspace(1) %inout, i64 %zext acq_rel
+  %old = atomicrmw add ptr addrspace(1) %inout, i64 %zext syncscope("agent") acq_rel
   store i64 %old, ptr addrspace(1) %out
   ret void
 }
@@ -1735,7 +1735,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw sub ptr addrspace(1) %inout, i32 5 acq_rel
+  %old = atomicrmw sub ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel
   store i32 %old, ptr addrspace(1) %out
   ret void
 }
@@ -2006,7 +2006,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive acq_rel
+  %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel
   store i32 %old, ptr addrspace(1) %out
   ret void
 }
@@ -2336,7 +2336,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
-  %old = atomicrmw sub ptr addrspace(1) %inout, i32 %lane acq_rel
+  %old = atomicrmw sub ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel
   store i32 %old, ptr addrspace(1) %out
   ret void
 }
@@ -2627,7 +2627,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw sub ptr addrspace(1) %inout, i64 5 acq_rel
+  %old = atomicrmw sub ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel
   store i64 %old, ptr addrspace(1) %out
   ret void
 }
@@ -2959,7 +2959,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 entry:
-  %old = atomicrmw sub ptr addrspace(1) %inout, i64 %subitive acq_rel
+  %old = atomicrmw sub ptr addrspace(1) %inout, i64 %subitive syncscope("agent") acq_rel
   store i64 %old, ptr addrspace(1) %out
   ret void
 }
@@ -3053,7 +3053,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
   %zext = zext i32 %lane to i64
-  %old = atomicrmw sub ptr addrspace(1) %inout, i64 %zext acq_rel
+  %old = atomicrmw sub ptr addrspace(1) %inout, i64 %zext syncscope("agent") acq_rel
   store i64 %old, ptr addrspace(1) %out
   ret void
 }

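Scope and ordering compose independently: the tests above pair syncscope("agent") with acq_rel, while the dag-divergence tests below pair it with monotonic. For instance (sketch, names illustrative):

  %a = atomicrmw add ptr addrspace(1) %p, i32 5 syncscope("agent") acq_rel
  %b = atomicrmw add ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
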
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
index feaca01e520c79..2e56839587f329 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
@@ -20,7 +20,7 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw add ptr addrspace(1) %p, i32 1 monotonic
+  %n32 = atomicrmw add ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
   store float 1.0, ptr addrspace(1) %p1
@@ -42,7 +42,7 @@ define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw sub ptr addrspace(1) %p, i32 1 monotonic
+  %n32 = atomicrmw sub ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
   store float 1.0, ptr addrspace(1) %p1
@@ -64,7 +64,7 @@ define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw and ptr addrspace(1) %p, i32 1 monotonic
+  %n32 = atomicrmw and ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
   store float 1.0, ptr addrspace(1) %p1
@@ -86,7 +86,7 @@ define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw or ptr addrspace(1) %p, i32 1 monotonic
+  %n32 = atomicrmw or ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
   store float 1.0, ptr addrspace(1) %p1
@@ -108,7 +108,7 @@ define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw xor ptr addrspace(1) %p, i32 1 monotonic
+  %n32 = atomicrmw xor ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
   store float 1.0, ptr addrspace(1) %p1
@@ -144,7 +144,7 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw nand ptr addrspace(1) %p, i32 1 monotonic
+  %n32 = atomicrmw nand ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
   store float 1.0, ptr addrspace(1) %p1
@@ -177,31 +177,18 @@ define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %
 ; CHECK-LABEL: max:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:  .LBB7_1: ; %atomicrmw.start
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_mov_b32_e32 v3, v0
-; CHECK-NEXT:    v_max_i32_e32 v2, 1, v3
-; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT:    global_atomic_smax v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execnz .LBB7_1
-; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    v_mov_b32_e32 v2, s2
-; CHECK-NEXT:    v_mov_b32_e32 v3, s3
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw max ptr addrspace(1) %p, i32 1 monotonic
+  %n32 = atomicrmw max ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
   store float 1.0, ptr addrspace(1) %p1
@@ -234,31 +221,18 @@ define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %
 ; CHECK-LABEL: min:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:  .LBB9_1: ; %atomicrmw.start
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_mov_b32_e32 v3, v0
-; CHECK-NEXT:    v_min_i32_e32 v2, 1, v3
-; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT:    global_atomic_smin v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execnz .LBB9_1
-; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    v_mov_b32_e32 v2, s2
-; CHECK-NEXT:    v_mov_b32_e32 v3, s3
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw min ptr addrspace(1) %p, i32 1 monotonic
+  %n32 = atomicrmw min ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
   store float 1.0, ptr addrspace(1) %p1
@@ -291,31 +265,18 @@ define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1)
 ; CHECK-LABEL: umax:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:  .LBB11_1: ; %atomicrmw.start
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_mov_b32_e32 v3, v0
-; CHECK-NEXT:    v_max_u32_e32 v2, 1, v3
-; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT:    global_atomic_umax v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execnz .LBB11_1
-; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    v_mov_b32_e32 v2, s2
-; CHECK-NEXT:    v_mov_b32_e32 v3, s3
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw umax ptr addrspace(1) %p, i32 1 monotonic
+  %n32 = atomicrmw umax ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
   store float 1.0, ptr addrspace(1) %p1
@@ -348,31 +309,18 @@ define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1)
 ; CHECK-LABEL: umin:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:  .LBB13_1: ; %atomicrmw.start
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_mov_b32_e32 v3, v0
-; CHECK-NEXT:    v_min_u32_e32 v2, 1, v3
-; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT:    global_atomic_umin v2, v0, v1, s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execnz .LBB13_1
-; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    v_mov_b32_e32 v2, s2
-; CHECK-NEXT:    v_mov_b32_e32 v3, s3
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw umin ptr addrspace(1) %p, i32 1 monotonic
+  %n32 = atomicrmw umin ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
   store float 1.0, ptr addrspace(1) %p1
@@ -418,7 +366,7 @@ define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw xchg ptr addrspace(1) %p, i32 1 monotonic
+  %n32 = atomicrmw xchg ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
   store float 1.0, ptr addrspace(1) %p1
@@ -440,7 +388,7 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw uinc_wrap ptr addrspace(1) %p, i32 1 monotonic
+  %n32 = atomicrmw uinc_wrap ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
   store float 1.0, ptr addrspace(1) %p1
@@ -462,7 +410,7 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %n32 = atomicrmw udec_wrap ptr addrspace(1) %p, i32 1 monotonic
+  %n32 = atomicrmw udec_wrap ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
   store float 1.0, ptr addrspace(1) %p1
@@ -498,7 +446,7 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %f32 = atomicrmw fadd ptr addrspace(1) %p, float 1.0 monotonic
+  %f32 = atomicrmw fadd ptr addrspace(1) %p, float 1.0 syncscope("agent") monotonic
   %n32 = fptoui float %f32 to i32
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
@@ -535,7 +483,7 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
-  %f32 = atomicrmw fsub ptr addrspace(1) %p, float 1.0 monotonic
+  %f32 = atomicrmw fsub ptr addrspace(1) %p, float 1.0 syncscope("agent") monotonic
   %n32 = fptoui float %f32 to i32
   %n64 = zext i32 %n32 to i64
   %p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
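
The dag-divergence-atomic.ll hunks above show the intent of the change most directly: at the default (system) scope the i32 min/umax/umin operations are expanded into a global_atomic_cmpswap retry loop (the removed .LBBn_1 atomicrmw.start blocks), while at syncscope("agent") they select directly to global_atomic_smin / global_atomic_umax / global_atomic_umin. A minimal sketch of the pattern, reduced from the tests above (function names are hypothetical):

define amdgpu_kernel void @min_system(ptr addrspace(1) %p) {
  ; default (system) scope: legalized via a cmpswap retry loop
  %r0 = atomicrmw min ptr addrspace(1) %p, i32 1 monotonic
  ret void
}

define amdgpu_kernel void @min_agent(ptr addrspace(1) %p) {
  ; agent scope: selects directly to global_atomic_smin
  %r1 = atomicrmw min ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
  ret void
}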

diff  --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index b3046851c9ad10..451fa3ba89b7d6 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -51,7 +51,7 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile add ptr %gep, i32 %in seq_cst
+  %val = atomicrmw add ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -103,7 +103,7 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 1023
-  %val = atomicrmw volatile add ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -157,7 +157,7 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 1024
-  %val = atomicrmw volatile add ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -218,7 +218,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile add ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -284,7 +284,7 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile add ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -358,7 +358,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile add ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -406,7 +406,7 @@ define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile add ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -462,7 +462,7 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile add ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -523,7 +523,7 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile add ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -592,7 +592,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile add ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -645,7 +645,7 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile and ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -706,7 +706,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile and ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -772,7 +772,7 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile and ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -846,7 +846,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile and ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -894,7 +894,7 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile and ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -950,7 +950,7 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile and ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -1011,7 +1011,7 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile and ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1080,7 +1080,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile and ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -1133,7 +1133,7 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile sub ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1194,7 +1194,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile sub ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -1260,7 +1260,7 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile sub ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1334,7 +1334,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile sub ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -1382,7 +1382,7 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile sub ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1438,7 +1438,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile sub ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -1499,7 +1499,7 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile sub ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1568,7 +1568,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile sub ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -3525,7 +3525,7 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile or ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3586,7 +3586,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile or ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -3652,7 +3652,7 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile or ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3726,7 +3726,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile or ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -3774,7 +3774,7 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile or ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3830,7 +3830,7 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile or ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -3891,7 +3891,7 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile or ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3960,7 +3960,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile or ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -4013,7 +4013,7 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile xchg ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4065,7 +4065,7 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr float, ptr %out, i32 4
-  %val = atomicrmw volatile xchg ptr %gep, float %in seq_cst
+  %val = atomicrmw volatile xchg ptr %gep, float %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4126,7 +4126,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile xchg ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -4192,7 +4192,7 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile xchg ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4266,7 +4266,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile xchg ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -4314,7 +4314,7 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4370,7 +4370,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -4431,7 +4431,7 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile xchg ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4500,7 +4500,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile xchg ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -4555,7 +4555,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -4619,7 +4619,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %flag = extractvalue { i32, i1 } %val, 0
   store i32 %flag, ptr %out2
   ret void
@@ -4692,7 +4692,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val  = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst
+  %val  = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -4772,7 +4772,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val  = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst
+  %val  = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %flag = extractvalue { i32, i1 } %val, 0
   store i32 %flag, ptr %out2
   ret void
@@ -4821,7 +4821,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile ptr %out, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -4880,7 +4880,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in,
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile ptr %out, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %flag = extractvalue { i32, i1 } %val, 0
   store i32 %flag, ptr %out2
   ret void
@@ -4948,7 +4948,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -5023,7 +5023,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %flag = extractvalue { i32, i1 } %val, 0
   store i32 %flag, ptr %out2
   ret void
@@ -5077,7 +5077,7 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile xor ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -5138,7 +5138,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile xor ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -5204,7 +5204,7 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile xor ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -5278,7 +5278,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile xor ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -5326,7 +5326,7 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xor ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -5382,7 +5382,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xor ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -5443,7 +5443,7 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile xor ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -5512,7 +5512,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile xor ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -7152,7 +7152,7 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -7204,7 +7204,7 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 1023
-  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -7258,7 +7258,7 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 1024
-  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -7319,7 +7319,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -7385,7 +7385,7 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -7459,7 +7459,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -7507,7 +7507,7 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -7563,7 +7563,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -7624,7 +7624,7 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -7693,7 +7693,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -7746,7 +7746,7 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -7798,7 +7798,7 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 1023
-  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -7852,7 +7852,7 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 1024
-  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -7913,7 +7913,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr %out, i32 4
-  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -7979,7 +7979,7 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -8053,7 +8053,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
   %gep = getelementptr i32, ptr %ptr, i32 4
-  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -8101,7 +8101,7 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile udec_wrap ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -8157,7 +8157,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile udec_wrap ptr %out, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
@@ -8218,7 +8218,7 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -8287,7 +8287,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr %out, i64 %index
-  %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
 }
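
In flat_atomics.ll only the IR lines move to the agent scope; the hunks leave the CHECK lines untouched, so the generated code for these tests is unchanged. The syncscope is spelled the same way across all three atomic forms touched here and in flat_atomics_i64.ll below: after the operands and before the ordering, with cmpxchg carrying both a success and a failure ordering. A reduced sketch, with hypothetical function name and arguments:

define amdgpu_kernel void @agent_scope_forms(ptr %p, ptr %q, i32 %v, i32 %old, i32 %new) {
  ; atomicrmw: the scope precedes the single ordering
  %rmw = atomicrmw volatile add ptr %p, i32 %v syncscope("agent") seq_cst
  ; cmpxchg: the scope precedes the success and failure orderings
  %pair = cmpxchg volatile ptr %p, i32 %old, i32 %new syncscope("agent") seq_cst seq_cst
  ; atomic load: the scope precedes the ordering, the alignment follows
  %ld = load atomic i64, ptr %q syncscope("agent") seq_cst, align 8
  ret void
}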

diff  --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index fea4645e6cd488..3186a06be22fbe 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -36,7 +36,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -82,7 +82,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -130,7 +130,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -181,7 +181,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -215,7 +215,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile add ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -256,7 +256,7 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile add ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -299,7 +299,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -345,7 +345,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -384,7 +384,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -430,7 +430,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -478,7 +478,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -529,7 +529,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -563,7 +563,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile and ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -604,7 +604,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile and ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -647,7 +647,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -693,7 +693,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -732,7 +732,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -778,7 +778,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -826,7 +826,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -877,7 +877,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -911,7 +911,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile sub ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -952,7 +952,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile sub ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -995,7 +995,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1041,7 +1041,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -2440,7 +2440,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -2486,7 +2486,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -2534,7 +2534,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -2585,7 +2585,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -2619,7 +2619,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile or ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -2660,7 +2660,7 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile or ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -2703,7 +2703,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -2749,7 +2749,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -2788,7 +2788,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -2826,7 +2826,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr double, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile xchg ptr %gep, double %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr %gep, double %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -2864,7 +2864,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr ptr, ptr %out, i32 4
-  %val = atomicrmw volatile xchg ptr %gep, ptr %in seq_cst
+  %val = atomicrmw volatile xchg ptr %gep, ptr %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -2910,7 +2910,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -2958,7 +2958,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3009,7 +3009,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -3043,7 +3043,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3084,7 +3084,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -3127,7 +3127,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3173,7 +3173,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -3212,7 +3212,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3258,7 +3258,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -3306,7 +3306,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3357,7 +3357,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -3391,7 +3391,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xor ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3432,7 +3432,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xor ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -3475,7 +3475,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3521,7 +3521,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -3598,7 +3598,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %val = load atomic i64, ptr %in seq_cst, align 8
+  %val = load atomic i64, ptr %in syncscope("agent") seq_cst, align 8
   store i64 %val, ptr %out
   ret void
 }
@@ -3882,7 +3882,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -3926,7 +3926,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 9000
-  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -3974,7 +3974,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
   store i64 %extract0, ptr %out2
   ret void
@@ -4025,7 +4025,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -4082,7 +4082,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
   store i64 %extract0, ptr %out2
   ret void
@@ -4123,7 +4123,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile ptr %out, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -4166,7 +4166,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in,
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile ptr %out, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
   store i64 %extract0, ptr %out2
   ret void
@@ -4212,7 +4212,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -4264,7 +4264,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
   store i64 %extract0, ptr %out2
   ret void
@@ -4342,7 +4342,7 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %val = load atomic double, ptr %in seq_cst, align 8
+  %val = load atomic double, ptr %in syncscope("agent") seq_cst, align 8
   store double %val, ptr %out
   ret void
 }
@@ -4620,7 +4620,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4666,7 +4666,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -4714,7 +4714,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4765,7 +4765,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -4799,7 +4799,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4840,7 +4840,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -4883,7 +4883,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4929,7 +4929,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -4968,7 +4968,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -5014,7 +5014,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
-  %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -5062,7 +5062,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -5113,7 +5113,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
   %gep = getelementptr i64, ptr %ptr, i64 4
-  %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -5147,7 +5147,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -5188,7 +5188,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }
@@ -5231,7 +5231,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -5277,7 +5277,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
-  %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr %out2
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
index 91f23b0e9eeb58..7d772de1e164ec 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
@@ -35,7 +35,7 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data seq_cst
+  %unused = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -71,7 +71,7 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_2047(ptr addrspace(1)
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2047
-  %unused = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data seq_cst
+  %unused = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -107,7 +107,7 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_neg2048(ptr addrspace(
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048
-  %unused = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data seq_cst
+  %unused = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -141,7 +141,7 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data seq_cst
+  %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -181,7 +181,7 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_2048(ptr addrspace(1) inreg %s
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2048
-  %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data seq_cst
+  %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -217,7 +217,7 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_neg2048(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048
-  %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data seq_cst
+  %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -276,7 +276,7 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset,
   %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data seq_cst
+  %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -330,7 +330,7 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 42
-  %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data seq_cst
+  %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -383,7 +383,7 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset
   %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data seq_cst
+  %unused = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -436,7 +436,7 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i3
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 42
-  %unused = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data seq_cst
+  %unused = atomicrmw xchg ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -478,7 +478,7 @@ define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn(ptr addrspace(1) inreg %
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i64 %data seq_cst
+  %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -514,7 +514,7 @@ define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn_neg128(ptr addrspace(1)
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i64 %data seq_cst
+  %rtn = atomicrmw xchg ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -549,7 +549,7 @@ define amdgpu_ps void @global_xchg_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw xchg ptr addrspace(1) %gep0, i64 %data seq_cst
+  %unused = atomicrmw xchg ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -584,7 +584,7 @@ define amdgpu_ps void @global_xchg_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw xchg ptr addrspace(1) %gep1, i64 %data seq_cst
+  %unused = atomicrmw xchg ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -622,7 +622,7 @@ define amdgpu_ps float @global_add_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw add ptr addrspace(1) %gep0, i32 %data seq_cst
+  %rtn = atomicrmw add ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -658,7 +658,7 @@ define amdgpu_ps float @global_add_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw add ptr addrspace(1) %gep1, i32 %data seq_cst
+  %rtn = atomicrmw add ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -693,7 +693,7 @@ define amdgpu_ps void @global_add_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw add ptr addrspace(1) %gep0, i32 %data seq_cst
+  %unused = atomicrmw add ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -728,7 +728,7 @@ define amdgpu_ps void @global_add_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw add ptr addrspace(1) %gep1, i32 %data seq_cst
+  %unused = atomicrmw add ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -762,7 +762,7 @@ define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn(ptr addrspace(1) inreg %s
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw add ptr addrspace(1) %gep0, i64 %data seq_cst
+  %rtn = atomicrmw add ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -798,7 +798,7 @@ define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn_neg128(ptr addrspace(1) i
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw add ptr addrspace(1) %gep1, i64 %data seq_cst
+  %rtn = atomicrmw add ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -833,7 +833,7 @@ define amdgpu_ps void @global_add_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw add ptr addrspace(1) %gep0, i64 %data seq_cst
+  %unused = atomicrmw add ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -868,7 +868,7 @@ define amdgpu_ps void @global_add_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw add ptr addrspace(1) %gep1, i64 %data seq_cst
+  %unused = atomicrmw add ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -906,7 +906,7 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw sub ptr addrspace(1) %gep0, i32 %data seq_cst
+  %rtn = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -942,7 +942,7 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw sub ptr addrspace(1) %gep1, i32 %data seq_cst
+  %rtn = atomicrmw sub ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -977,7 +977,7 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw sub ptr addrspace(1) %gep0, i32 %data seq_cst
+  %unused = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -1012,7 +1012,7 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw sub ptr addrspace(1) %gep1, i32 %data seq_cst
+  %unused = atomicrmw sub ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -1046,7 +1046,7 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(ptr addrspace(1) inreg %s
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw sub ptr addrspace(1) %gep0, i64 %data seq_cst
+  %rtn = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -1082,7 +1082,7 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(ptr addrspace(1) i
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw sub ptr addrspace(1) %gep1, i64 %data seq_cst
+  %rtn = atomicrmw sub ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -1117,7 +1117,7 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw sub ptr addrspace(1) %gep0, i64 %data seq_cst
+  %unused = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -1152,7 +1152,7 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw sub ptr addrspace(1) %gep1, i64 %data seq_cst
+  %unused = atomicrmw sub ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -1190,7 +1190,7 @@ define amdgpu_ps float @global_and_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw and ptr addrspace(1) %gep0, i32 %data seq_cst
+  %rtn = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -1226,7 +1226,7 @@ define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw and ptr addrspace(1) %gep1, i32 %data seq_cst
+  %rtn = atomicrmw and ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -1261,7 +1261,7 @@ define amdgpu_ps void @global_and_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw and ptr addrspace(1) %gep0, i32 %data seq_cst
+  %unused = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -1296,7 +1296,7 @@ define amdgpu_ps void @global_and_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw and ptr addrspace(1) %gep1, i32 %data seq_cst
+  %unused = atomicrmw and ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -1330,7 +1330,7 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(ptr addrspace(1) inreg %s
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw and ptr addrspace(1) %gep0, i64 %data seq_cst
+  %rtn = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -1366,7 +1366,7 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(ptr addrspace(1) i
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw and ptr addrspace(1) %gep1, i64 %data seq_cst
+  %rtn = atomicrmw and ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -1401,7 +1401,7 @@ define amdgpu_ps void @global_and_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw and ptr addrspace(1) %gep0, i64 %data seq_cst
+  %unused = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -1436,7 +1436,7 @@ define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw and ptr addrspace(1) %gep1, i64 %data seq_cst
+  %unused = atomicrmw and ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -1474,7 +1474,7 @@ define amdgpu_ps float @global_or_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw or ptr addrspace(1) %gep0, i32 %data seq_cst
+  %rtn = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -1510,7 +1510,7 @@ define amdgpu_ps float @global_or_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %s
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw or ptr addrspace(1) %gep1, i32 %data seq_cst
+  %rtn = atomicrmw or ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -1545,7 +1545,7 @@ define amdgpu_ps void @global_or_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw or ptr addrspace(1) %gep0, i32 %data seq_cst
+  %unused = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -1580,7 +1580,7 @@ define amdgpu_ps void @global_or_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw or ptr addrspace(1) %gep1, i32 %data seq_cst
+  %unused = atomicrmw or ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -1614,7 +1614,7 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(ptr addrspace(1) inreg %sb
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw or ptr addrspace(1) %gep0, i64 %data seq_cst
+  %rtn = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -1650,7 +1650,7 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(ptr addrspace(1) in
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw or ptr addrspace(1) %gep1, i64 %data seq_cst
+  %rtn = atomicrmw or ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -1685,7 +1685,7 @@ define amdgpu_ps void @global_or_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw or ptr addrspace(1) %gep0, i64 %data seq_cst
+  %unused = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -1720,7 +1720,7 @@ define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw or ptr addrspace(1) %gep1, i64 %data seq_cst
+  %unused = atomicrmw or ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -1758,7 +1758,7 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw xor ptr addrspace(1) %gep0, i32 %data seq_cst
+  %rtn = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -1794,7 +1794,7 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw xor ptr addrspace(1) %gep1, i32 %data seq_cst
+  %rtn = atomicrmw xor ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -1829,7 +1829,7 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw xor ptr addrspace(1) %gep0, i32 %data seq_cst
+  %unused = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -1864,7 +1864,7 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw xor ptr addrspace(1) %gep1, i32 %data seq_cst
+  %unused = atomicrmw xor ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -1898,7 +1898,7 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(ptr addrspace(1) inreg %s
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw xor ptr addrspace(1) %gep0, i64 %data seq_cst
+  %rtn = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -1934,7 +1934,7 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(ptr addrspace(1) i
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw xor ptr addrspace(1) %gep1, i64 %data seq_cst
+  %rtn = atomicrmw xor ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -1969,7 +1969,7 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw xor ptr addrspace(1) %gep0, i64 %data seq_cst
+  %unused = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -2004,7 +2004,7 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw xor ptr addrspace(1) %gep1, i64 %data seq_cst
+  %unused = atomicrmw xor ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
   ret void
 }
 
@@ -3374,7 +3374,7 @@ define amdgpu_ps float @global_inc_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data monotonic
+  %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -3394,7 +3394,7 @@ define amdgpu_ps float @global_inc_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data monotonic
+  %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -3413,7 +3413,7 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data monotonic
+  %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic
   ret void
 }
 
@@ -3432,7 +3432,7 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data monotonic
+  %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic
   ret void
 }
 
@@ -3450,7 +3450,7 @@ define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn(ptr addrspace(1) inreg %s
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data monotonic
+  %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -3470,7 +3470,7 @@ define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn_neg128(ptr addrspace(1) i
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data monotonic
+  %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -3489,7 +3489,7 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data monotonic
+  %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic
   ret void
 }
 
@@ -3508,7 +3508,7 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data monotonic
+  %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
   ret void
 }
 
@@ -3531,7 +3531,7 @@ define amdgpu_ps float @global_dec_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data monotonic
+  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -3551,7 +3551,7 @@ define amdgpu_ps float @global_dec_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data monotonic
+  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
@@ -3570,7 +3570,7 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data monotonic
+  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic
   ret void
 }
 
@@ -3589,7 +3589,7 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data monotonic
+  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic
   ret void
 }
 
@@ -3607,7 +3607,7 @@ define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn(ptr addrspace(1) inreg %s
 ; GFX11-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data monotonic
+  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -3627,7 +3627,7 @@ define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn_neg128(ptr addrspace(1) i
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data monotonic
+  %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
   %cast.rtn = bitcast i64 %rtn to <2 x float>
   ret <2 x float> %cast.rtn
 }
@@ -3646,7 +3646,7 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
 ; GFX11-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
-  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data monotonic
+  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic
   ret void
 }
 
@@ -3665,7 +3665,7 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
-  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data monotonic
+  %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 8dab061aab4789..08e06d4dd015a5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -46,7 +46,7 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -97,7 +97,7 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
-  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -146,7 +146,7 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
-  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -200,7 +200,7 @@ define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595
 
-  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -258,7 +258,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -321,7 +321,7 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -394,7 +394,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -441,7 +441,7 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -498,7 +498,7 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -558,7 +558,7 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -628,7 +628,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -676,7 +676,7 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -734,7 +734,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -797,7 +797,7 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -870,7 +870,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -917,7 +917,7 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -974,7 +974,7 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -1034,7 +1034,7 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1104,7 +1104,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -1152,7 +1152,7 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1210,7 +1210,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -1273,7 +1273,7 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1346,7 +1346,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -1393,7 +1393,7 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1450,7 +1450,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -1510,7 +1510,7 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1580,7 +1580,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -1588,90 +1588,47 @@ entry:
 define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_max_i32_offset:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dword s3, s[4:5], 0x4
-; SI-NEXT:    s_mov_b64 s[0:1], 0
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:  .LBB27_1: ; %atomicrmw.start
-; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_max_i32_e32 v0, s2, v1
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; SI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT:    buffer_atomic_smax v0, off, s[0:3], 0 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v1, v2
-; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; SI-NEXT:    s_cbranch_execnz .LBB27_1
-; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: atomic_max_i32_offset:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s6, s[0:1], 0x2c
-; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[8:9], 0x10
-; VI-NEXT:    s_add_u32 s0, s8, 16
-; VI-NEXT:    s_addc_u32 s1, s9, 0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:  .LBB27_1: ; %atomicrmw.start
-; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; VI-NEXT:    v_max_i32_e32 v0, s6, v1
-; VI-NEXT:    v_mov_b32_e32 v3, v1
-; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; VI-NEXT:    buffer_atomic_smax v0, off, s[0:3], 0 offset:16
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT:    v_mov_b32_e32 v1, v2
-; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT:    s_cbranch_execnz .LBB27_1
-; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: atomic_max_i32_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b64 s[0:1], 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x10
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:  .LBB27_1: ; %atomicrmw.start
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_max_i32_e32 v0, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
+; GFX9-NEXT:    global_atomic_smax v0, v1, s[2:3] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT:    s_cbranch_execnz .LBB27_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
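The atomic_max_i32_offset hunk above shows the point of the syncscope change: with the default (system) scope the AMDGPU backend conservatively expands atomicrmw max into a cmpxchg loop, whereas syncscope("agent") lets it select the hardware instruction directly (buffer_atomic_smax on SI/VI, global_atomic_smax on GFX9). A minimal standalone sketch of the pattern, not part of this patch (the @smax_agent name is illustrative); it can be fed to llc -mtriple=amdgcn -mcpu=gfx900:

; Agent-scoped signed-max RMW; selectable directly to global_atomic_smax
; on gfx900 instead of being expanded to a compare-and-swap loop.
define amdgpu_kernel void @smax_agent(ptr addrspace(1) %out, i32 %in) {
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw max ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
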
@@ -3437,7 +3394,7 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3495,7 +3452,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr a
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -3558,7 +3515,7 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3631,7 +3588,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -3678,7 +3635,7 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3735,7 +3692,7 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -3795,7 +3752,7 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3865,7 +3822,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -3913,7 +3870,7 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3960,7 +3917,7 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float %
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr float, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, float %in seq_cst
+  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, float %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4018,7 +3975,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -4081,7 +4038,7 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out,
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4154,7 +4111,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -4201,7 +4158,7 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4258,7 +4215,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -4318,7 +4275,7 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile xchg ptr addrspace(1) %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4388,7 +4345,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile xchg ptr addrspace(1) %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile xchg ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -4440,7 +4397,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -4501,7 +4458,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %extract0 = extractvalue { i32, i1 } %val, 0
   store i32 %extract0, ptr addrspace(1) %out2
   ret void
@@ -4571,7 +4528,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -4650,7 +4607,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1)
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %extract0 = extractvalue { i32, i1 } %val, 0
   store i32 %extract0, ptr addrspace(1) %out2
   ret void
@@ -4702,7 +4659,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile ptr addrspace(1) %out, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -4762,7 +4719,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add
 ; GFX9-NEXT:    global_store_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile ptr addrspace(1) %out, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %extract0 = extractvalue { i32, i1 } %val, 0
   store i32 %extract0, ptr addrspace(1) %out2
   ret void
@@ -4829,7 +4786,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = cmpxchg volatile ptr addrspace(1) %ptr, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -4905,7 +4862,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = cmpxchg volatile ptr addrspace(1) %ptr, i32 %old, i32 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %extract0 = extractvalue { i32, i1 } %val, 0
   store i32 %extract0, ptr addrspace(1) %out2
   ret void
@@ -4954,7 +4911,7 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -5012,7 +4969,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -5075,7 +5032,7 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -5148,7 +5105,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -5195,7 +5152,7 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -5252,7 +5209,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -5312,7 +5269,7 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -5382,7 +5339,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
-  %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in seq_cst
+  %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -5597,7 +5554,7 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4
+  %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4
   store i32 %val, ptr addrspace(1) %out
   ret void
 }
@@ -6642,7 +6599,7 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -6693,7 +6650,7 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
-  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -6742,7 +6699,7 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
-  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -6795,7 +6752,7 @@ define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595
-  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -6853,7 +6810,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -6916,7 +6873,7 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -6989,7 +6946,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -7037,7 +6994,7 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -7088,7 +7045,7 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
-  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -7137,7 +7094,7 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
-  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -7190,7 +7147,7 @@ define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595
-  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -7248,7 +7205,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
-  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
@@ -7311,7 +7268,7 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -7384,7 +7341,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
-  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
+  %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
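
For context on the run of hunks above: an atomic that names no syncscope is
system-scoped in LLVM IR, while syncscope("agent") constrains the ordering to
threads on a single GPU, which is the case the backend can select directly to
the global atomic instructions. A minimal standalone sketch of the updated
form (the kernel name is illustrative, not taken from the patch):

  define amdgpu_kernel void @example_agent_add(ptr addrspace(1) %out, i32 %in) {
  entry:
    ; Agent scope: ordering is only enforced among threads on one device.
    %old = atomicrmw volatile add ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
    ret void
  }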

diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index 651ada15ffcc3b..889139450ca2f1 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -46,7 +46,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -107,7 +107,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -170,7 +170,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -240,7 +240,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -291,7 +291,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -351,7 +351,7 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -411,7 +411,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
-  %tmp0 = atomicrmw volatile add ptr addrspace(1) %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -478,7 +478,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
-  %tmp0 = atomicrmw volatile add ptr addrspace(1) %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile add ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -526,7 +526,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -587,7 +587,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -650,7 +650,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -720,7 +720,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -771,7 +771,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -831,7 +831,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -891,7 +891,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
-  %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -958,7 +958,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
-  %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -1006,7 +1006,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1067,7 +1067,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -1130,7 +1130,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1200,7 +1200,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -1251,7 +1251,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1311,7 +1311,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -1371,7 +1371,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
-  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -1438,7 +1438,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
-  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -3262,7 +3262,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3323,7 +3323,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -3386,7 +3386,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3456,7 +3456,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -3507,7 +3507,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3567,7 +3567,7 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -3627,7 +3627,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
-  %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3694,7 +3694,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
-  %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -3742,7 +3742,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3789,7 +3789,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr double, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, double %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, double %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3836,7 +3836,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr ptr, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, ptr %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, ptr %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -3897,7 +3897,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -3960,7 +3960,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out,
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4030,7 +4030,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -4081,7 +4081,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4141,7 +4141,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -4201,7 +4201,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
-  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4268,7 +4268,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
-  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -4316,7 +4316,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4377,7 +4377,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -4440,7 +4440,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4510,7 +4510,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -4561,7 +4561,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4621,7 +4621,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -4681,7 +4681,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
-  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -4748,7 +4748,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
-  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -4809,7 +4809,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -4871,7 +4871,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 9000
-  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -4935,7 +4935,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
   store i64 %extract0, ptr addrspace(1) %out2
   ret void
@@ -5003,7 +5003,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -5082,7 +5082,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
   store i64 %extract0, ptr addrspace(1) %out2
   ret void
@@ -5143,7 +5143,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile ptr addrspace(1) %out, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -5206,7 +5206,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = cmpxchg volatile ptr addrspace(1) %out, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
   store i64 %extract0, ptr addrspace(1) %out2
   ret void
@@ -5271,7 +5271,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
-  %val = cmpxchg volatile ptr addrspace(1) %ptr, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
 
@@ -5347,7 +5347,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
-  %val = cmpxchg volatile ptr addrspace(1) %ptr, i64 %old, i64 %in seq_cst seq_cst
+  %val = cmpxchg volatile ptr addrspace(1) %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
   store i64 %extract0, ptr addrspace(1) %out2
   ret void
@@ -5509,7 +5509,7 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 entry:
-  %val = load atomic i64, ptr addrspace(1) %in seq_cst, align 8
+  %val = load atomic i64, ptr addrspace(1) %in syncscope("agent") seq_cst, align 8
   store i64 %val, ptr addrspace(1) %out
   ret void
 }
@@ -6013,7 +6013,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -6074,7 +6074,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -6137,7 +6137,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -6184,7 +6184,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
 
@@ -6245,7 +6245,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
-  %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   store i64 %tmp0, ptr addrspace(1) %out2
   ret void
 }
@@ -6308,6 +6308,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i
 entry:
   %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
   %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
-  %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
+  %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
   ret void
 }
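
The same syntactic rule applies throughout these i64 hunks: the scope is
written immediately before the ordering, and for cmpxchg before both the
success and the failure ordering. A small self-contained sketch (again with
illustrative names only, not lines from the patch):

  define amdgpu_kernel void @example_agent_i64(ptr addrspace(1) %ptr, i64 %old, i64 %in) {
  entry:
    ; cmpxchg: syncscope("agent") precedes "seq_cst seq_cst" (success, failure).
    %pair = cmpxchg volatile ptr addrspace(1) %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
    ; Atomic loads take the scope before the ordering and need an explicit alignment.
    %val = load atomic i64, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 8
    ret void
  }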

diff --git a/llvm/test/CodeGen/AMDGPU/mubuf.ll b/llvm/test/CodeGen/AMDGPU/mubuf.ll
index a97061b8cc5e38..37bd7bc92c2ddf 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf.ll
@@ -159,7 +159,7 @@ define amdgpu_kernel void @store_sgpr_ptr_large_offset(ptr addrspace(1) %out) {
 ; CHECK: buffer_atomic_add v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
 define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(ptr addrspace(1) %out) {
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 32768
-  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 5 seq_cst
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 5 syncscope("agent") seq_cst
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
index d6d7a80ece38ad..0d9de8b33397b4 100644
--- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
@@ -524,7 +524,7 @@ entry:
 define protected amdgpu_kernel void @may_alias_atomic_rmw(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @may_alias_atomic_rmw(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(1) [[OUT:%.*]], i32 5 seq_cst, align 4
+; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(1) [[OUT:%.*]], i32 5 syncscope("agent") seq_cst, align 4
 ; CHECK-NEXT:    fence syncscope("workgroup") release
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    fence syncscope("workgroup") acquire
@@ -533,7 +533,7 @@ define protected amdgpu_kernel void @may_alias_atomic_rmw(ptr addrspace(1) %in,
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %unused = atomicrmw add ptr addrspace(1) %out, i32 5 seq_cst
+  %unused = atomicrmw add ptr addrspace(1) %out, i32 5 syncscope("agent") seq_cst
   fence syncscope("workgroup") release
   tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire

diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
index 30e79922935964..46ea7266f28254 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
@@ -12,7 +12,7 @@ define void @shl_base_atomicrmw_global_ptr(ptr addrspace(1) %out, ptr addrspace(
   %cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
   %shl = shl i64 %cast, 2
   %castback = inttoptr i64 %shl to ptr addrspace(1)
-  %val = atomicrmw and ptr addrspace(1) %castback, i32 3 seq_cst
+  %val = atomicrmw and ptr addrspace(1) %castback, i32 3 syncscope("agent") seq_cst
   store volatile i64 %cast, ptr addrspace(1) %extra.use, align 4
   ret void
 }
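
As a usage note, a test like the ones touched here is typically driven by an
llc RUN line plus FileCheck. A hypothetical, hand-written sketch (the target,
kernel name, and expected mnemonic are assumptions for illustration, not
lines from this patch):

  ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s

  ; GFX9-LABEL: {{^}}example_agent_and:
  ; GFX9: global_atomic_and
  define amdgpu_kernel void @example_agent_and(ptr addrspace(1) %out) {
    %val = atomicrmw and ptr addrspace(1) %out, i32 3 syncscope("agent") seq_cst
    ret void
  }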
