[llvm] 2a95022 - AMDGPU: Add atomic bfloat load/store codegen tests

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 25 07:08:26 PDT 2024


Author: Matt Arsenault
Date: 2024-04-25T16:08:11+02:00
New Revision: 2a95022cff38dc0978f527ae580b5720eb9e4d98

URL: https://github.com/llvm/llvm-project/commit/2a95022cff38dc0978f527ae580b5720eb9e4d98
DIFF: https://github.com/llvm/llvm-project/commit/2a95022cff38dc0978f527ae580b5720eb9e4d98.diff

LOG: AMDGPU: Add atomic bfloat load/store codegen tests
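
These tests cover atomic loads and stores of bfloat, along with previously
missing half cases, across the local, flat, and global address spaces. A
representative test, taken verbatim from atomic_load_local.ll below:

    define i16 @atomic_load_monotonic_bf16(ptr addrspace(3) %ptr) {
      %load = load atomic bfloat, ptr addrspace(3) %ptr monotonic, align 2
      %ret = bitcast bfloat %load to i16
      ret i16 %ret
    }

As the check lines show, atomic bfloat accesses are expected to select to
the same 16-bit memory instructions as i16 and half: ds_read_u16 and
ds_write_b16 for LDS, flat_load_ushort and flat_store_short for flat, and
the corresponding buffer/global instructions for global memory.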

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
    llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
    llvm/test/CodeGen/AMDGPU/flat_atomics.ll
    llvm/test/CodeGen/AMDGPU/global_atomics.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
index bfd18f1b52a51b..a3b6c283512f3e 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
@@ -152,3 +152,57 @@ define ptr addrspace(3) @atomic_load_monotonic_p3i8_offset(ptr addrspace(3) %ptr
   %load = load atomic ptr addrspace(3), ptr addrspace(3) %gep monotonic, align 4
   ret ptr addrspace(3) %load
 }
+
+; GCN-LABEL: {{^}}atomic_load_monotonic_f16:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_read_u16 v0, v0{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define i16 @atomic_load_monotonic_f16(ptr addrspace(3) %ptr) {
+  %load = load atomic half, ptr addrspace(3) %ptr monotonic, align 2
+  %ret = bitcast half %load to i16
+  ret i16 %ret
+}
+
+; GCN-LABEL: {{^}}atomic_load_monotonic_f16_offset:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define i16 @atomic_load_monotonic_f16_offset(ptr addrspace(3) %ptr) {
+  %gep = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16
+  %load = load atomic half, ptr addrspace(3) %gep monotonic, align 2
+  %ret = bitcast half %load to i16
+  ret i16 %ret
+}
+
+; GCN-LABEL: {{^}}atomic_load_monotonic_bf16:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_read_u16 v0, v0{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define i16 @atomic_load_monotonic_bf16(ptr addrspace(3) %ptr) {
+  %load = load atomic bfloat, ptr addrspace(3) %ptr monotonic, align 2
+  %ret = bitcast bfloat %load to i16
+  ret i16 %ret
+}
+
+; GCN-LABEL: {{^}}atomic_load_monotonic_bf16_offset:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define i16 @atomic_load_monotonic_bf16_offset(ptr addrspace(3) %ptr) {
+  %gep = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16
+  %load = load atomic bfloat, ptr addrspace(3) %gep monotonic, align 2
+  %ret = bitcast bfloat %load to i16
+  ret i16 %ret
+}

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
index 71e24c1692c7f4..cd1e1fb1add473 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
@@ -101,3 +101,56 @@ define void @atomic_store_monotonic_offset_i64(ptr addrspace(3) %ptr, i64 %val)
   ret void
 }
 
+; GCN-LABEL: {{^}}atomic_store_monotonic_f16:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_write_b16 v0, v1{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @atomic_store_monotonic_f16(ptr addrspace(3) %ptr, i16 %arg.val) {
+  %val = bitcast i16 %arg.val to half
+  store atomic half %val, ptr addrspace(3) %ptr monotonic, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_monotonic_offset_f16:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @atomic_store_monotonic_offset_f16(ptr addrspace(3) %ptr, i16 %arg.val) {
+  %val = bitcast i16 %arg.val to half
+  %gep = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16
+  store atomic half %val, ptr addrspace(3) %gep monotonic, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_monotonic_bf16:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_write_b16 v0, v1{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @atomic_store_monotonic_bf16(ptr addrspace(3) %ptr, i16 %arg.val) {
+  %val = bitcast i16 %arg.val to bfloat
+  store atomic bfloat %val, ptr addrspace(3) %ptr monotonic, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_store_monotonic_offset_bf16:
+; GCN: s_waitcnt
+; GFX9-NOT: s_mov_b32 m0
+; CI-NEXT: s_mov_b32 m0
+; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}}
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @atomic_store_monotonic_offset_bf16(ptr addrspace(3) %ptr, i16 %arg.val) {
+  %val = bitcast i16 %arg.val to bfloat
+  %gep = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16
+  store atomic bfloat %val, ptr addrspace(3) %gep monotonic, align 2
+  ret void
+}

diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index 06ba60518adc04..e44572985e6d2e 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -6741,6 +6741,81 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) {
+; GCN1-LABEL: atomic_store_bf16_offset:
+; GCN1:       ; %bb.0:
+; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
+; GCN1-NEXT:    s_load_dword s0, s[0:1], 0x9
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    v_mov_b32_e32 v0, s2
+; GCN1-NEXT:    v_mov_b32_e32 v1, s3
+; GCN1-NEXT:    v_mov_b32_e32 v2, s0
+; GCN1-NEXT:    flat_store_short v[0:1], v2
+; GCN1-NEXT:    s_endpgm
+;
+; GCN2-LABEL: atomic_store_bf16_offset:
+; GCN2:       ; %bb.0:
+; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GCN2-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    v_mov_b32_e32 v0, s2
+; GCN2-NEXT:    v_mov_b32_e32 v1, s3
+; GCN2-NEXT:    v_mov_b32_e32 v2, s0
+; GCN2-NEXT:    flat_store_short v[0:1], v2
+; GCN2-NEXT:    s_endpgm
+;
+; GCN3-LABEL: atomic_store_bf16_offset:
+; GCN3:       ; %bb.0:
+; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GCN3-NEXT:    s_load_dword s4, s[0:1], 0x24
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v2, s4
+; GCN3-NEXT:    flat_store_short v[0:1], v2
+; GCN3-NEXT:    s_endpgm
+  %gep = getelementptr bfloat, ptr %out, i64 8
+  store atomic bfloat %in, ptr %out seq_cst, align 2
+  ret void
+}
+
+define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) {
+; GCN1-LABEL: atomic_store_bf16:
+; GCN1:       ; %bb.0:
+; GCN1-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
+; GCN1-NEXT:    s_load_dword s0, s[0:1], 0x9
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    v_mov_b32_e32 v0, s2
+; GCN1-NEXT:    v_mov_b32_e32 v1, s3
+; GCN1-NEXT:    v_mov_b32_e32 v2, s0
+; GCN1-NEXT:    flat_store_short v[0:1], v2
+; GCN1-NEXT:    s_endpgm
+;
+; GCN2-LABEL: atomic_store_bf16:
+; GCN2:       ; %bb.0:
+; GCN2-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GCN2-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    v_mov_b32_e32 v0, s2
+; GCN2-NEXT:    v_mov_b32_e32 v1, s3
+; GCN2-NEXT:    v_mov_b32_e32 v2, s0
+; GCN2-NEXT:    flat_store_short v[0:1], v2
+; GCN2-NEXT:    s_endpgm
+;
+; GCN3-LABEL: atomic_store_bf16:
+; GCN3:       ; %bb.0:
+; GCN3-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GCN3-NEXT:    s_load_dword s4, s[0:1], 0x24
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    v_mov_b32_e32 v2, s4
+; GCN3-NEXT:    flat_store_short v[0:1], v2
+; GCN3-NEXT:    s_endpgm
+  store atomic bfloat %in, ptr %out seq_cst, align 2
+  ret void
+}
+
 define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_inc_i32_offset:
 ; GCN1:       ; %bb.0: ; %entry
@@ -7868,3 +7943,201 @@ entry:
   store i32 %val, ptr %out2
   ret void
 }
+
+define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
+; GCN1-LABEL: atomic_load_f16_offset:
+; GCN1:       ; %bb.0:
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    s_add_u32 s0, s0, 16
+; GCN1-NEXT:    s_addc_u32 s1, s1, 0
+; GCN1-NEXT:    v_mov_b32_e32 v0, s0
+; GCN1-NEXT:    v_mov_b32_e32 v1, s1
+; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    v_mov_b32_e32 v0, s2
+; GCN1-NEXT:    v_mov_b32_e32 v1, s3
+; GCN1-NEXT:    flat_store_short v[0:1], v2
+; GCN1-NEXT:    s_endpgm
+;
+; GCN2-LABEL: atomic_load_f16_offset:
+; GCN2:       ; %bb.0:
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    s_add_u32 s0, s0, 16
+; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    v_mov_b32_e32 v0, s0
+; GCN2-NEXT:    v_mov_b32_e32 v1, s1
+; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    v_mov_b32_e32 v0, s2
+; GCN2-NEXT:    v_mov_b32_e32 v1, s3
+; GCN2-NEXT:    flat_store_short v[0:1], v2
+; GCN2-NEXT:    s_endpgm
+;
+; GCN3-LABEL: atomic_load_f16_offset:
+; GCN3:       ; %bb.0:
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    flat_store_short v[0:1], v2
+; GCN3-NEXT:    s_endpgm
+  %gep = getelementptr half, ptr %in, i64 8
+  %val = load atomic half, ptr %gep seq_cst, align 2
+  store half %val, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
+; GCN1-LABEL: atomic_load_f16:
+; GCN1:       ; %bb.0:
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    v_mov_b32_e32 v0, s0
+; GCN1-NEXT:    v_mov_b32_e32 v1, s1
+; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    v_mov_b32_e32 v0, s2
+; GCN1-NEXT:    v_mov_b32_e32 v1, s3
+; GCN1-NEXT:    flat_store_short v[0:1], v2
+; GCN1-NEXT:    s_endpgm
+;
+; GCN2-LABEL: atomic_load_f16:
+; GCN2:       ; %bb.0:
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    v_mov_b32_e32 v0, s0
+; GCN2-NEXT:    v_mov_b32_e32 v1, s1
+; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    v_mov_b32_e32 v0, s2
+; GCN2-NEXT:    v_mov_b32_e32 v1, s3
+; GCN2-NEXT:    flat_store_short v[0:1], v2
+; GCN2-NEXT:    s_endpgm
+;
+; GCN3-LABEL: atomic_load_f16:
+; GCN3:       ; %bb.0:
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    flat_store_short v[0:1], v2
+; GCN3-NEXT:    s_endpgm
+  %val = load atomic half, ptr %in seq_cst, align 2
+  store half %val, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
+; GCN1-LABEL: atomic_load_bf16_offset:
+; GCN1:       ; %bb.0:
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    s_add_u32 s0, s0, 16
+; GCN1-NEXT:    s_addc_u32 s1, s1, 0
+; GCN1-NEXT:    v_mov_b32_e32 v0, s0
+; GCN1-NEXT:    v_mov_b32_e32 v1, s1
+; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    v_mov_b32_e32 v0, s2
+; GCN1-NEXT:    v_mov_b32_e32 v1, s3
+; GCN1-NEXT:    flat_store_short v[0:1], v2
+; GCN1-NEXT:    s_endpgm
+;
+; GCN2-LABEL: atomic_load_bf16_offset:
+; GCN2:       ; %bb.0:
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    s_add_u32 s0, s0, 16
+; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    v_mov_b32_e32 v0, s0
+; GCN2-NEXT:    v_mov_b32_e32 v1, s1
+; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    v_mov_b32_e32 v0, s2
+; GCN2-NEXT:    v_mov_b32_e32 v1, s3
+; GCN2-NEXT:    flat_store_short v[0:1], v2
+; GCN2-NEXT:    s_endpgm
+;
+; GCN3-LABEL: atomic_load_bf16_offset:
+; GCN3:       ; %bb.0:
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    flat_load_ushort v2, v[0:1] offset:16 glc
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    flat_store_short v[0:1], v2
+; GCN3-NEXT:    s_endpgm
+  %gep = getelementptr bfloat, ptr %in, i64 8
+  %val = load atomic bfloat, ptr %gep seq_cst, align 2
+  store bfloat %val, ptr %out
+  ret void
+}
+
+define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
+; GCN1-LABEL: atomic_load_bf16:
+; GCN1:       ; %bb.0:
+; GCN1-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN1-NEXT:    v_mov_b32_e32 v0, s0
+; GCN1-NEXT:    v_mov_b32_e32 v1, s1
+; GCN1-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    v_mov_b32_e32 v0, s2
+; GCN1-NEXT:    v_mov_b32_e32 v1, s3
+; GCN1-NEXT:    flat_store_short v[0:1], v2
+; GCN1-NEXT:    s_endpgm
+;
+; GCN2-LABEL: atomic_load_bf16:
+; GCN2:       ; %bb.0:
+; GCN2-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN2-NEXT:    v_mov_b32_e32 v0, s0
+; GCN2-NEXT:    v_mov_b32_e32 v1, s1
+; GCN2-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    v_mov_b32_e32 v0, s2
+; GCN2-NEXT:    v_mov_b32_e32 v1, s3
+; GCN2-NEXT:    flat_store_short v[0:1], v2
+; GCN2-NEXT:    s_endpgm
+;
+; GCN3-LABEL: atomic_load_bf16:
+; GCN3:       ; %bb.0:
+; GCN3-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN3-NEXT:    v_mov_b32_e32 v0, s0
+; GCN3-NEXT:    v_mov_b32_e32 v1, s1
+; GCN3-NEXT:    flat_load_ushort v2, v[0:1] glc
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    v_mov_b32_e32 v0, s2
+; GCN3-NEXT:    v_mov_b32_e32 v1, s3
+; GCN3-NEXT:    flat_store_short v[0:1], v2
+; GCN3-NEXT:    s_endpgm
+  %val = load atomic bfloat, ptr %in seq_cst, align 2
+  store bfloat %val, ptr %out
+  ret void
+}

diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 674d7a3c5c9b44..dac3a3db7b450b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -6216,6 +6216,81 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) %out) {
+; SI-LABEL: atomic_store_bf16_offset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:16
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: atomic_store_bf16_offset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; VI-NEXT:    s_load_dword s4, s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s2, 16
+; VI-NEXT:    s_addc_u32 s1, s3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: atomic_store_bf16_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3] offset:16
+; GFX9-NEXT:    s_endpgm
+  %gep = getelementptr bfloat, ptr addrspace(1) %out, i64 8
+  store atomic bfloat %in, ptr addrspace(1) %gep seq_cst, align 2
+  ret void
+}
+
+define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) {
+; SI-LABEL: atomic_store_bf16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: atomic_store_bf16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: atomic_store_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+  store atomic bfloat %in, ptr addrspace(1) %out seq_cst, align 2
+  ret void
+}
+
 define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: atomic_inc_i32_offset:
 ; SI:       ; %bb.0: ; %entry
@@ -6963,3 +7038,207 @@ entry:
   store i32 %val, ptr addrspace(1) %out2
   ret void
 }
+
+define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; SI-LABEL: atomic_load_f16_offset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, s2
+; SI-NEXT:    s_mov_b32 s5, s3
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: atomic_load_f16_offset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s2
+; VI-NEXT:    s_mov_b32 s5, s3
+; VI-NEXT:    s_mov_b32 s2, s6
+; VI-NEXT:    s_mov_b32 s3, s7
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: atomic_load_f16_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+  %gep = getelementptr half, ptr addrspace(1) %in, i64 8
+  %val = load atomic half, ptr addrspace(1) %gep seq_cst, align 2
+  store half %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; SI-LABEL: atomic_load_f16_negoffset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, s2
+; SI-NEXT:    s_mov_b32 s5, s3
+; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    v_mov_b32_e32 v0, 0xfffffe00
+; SI-NEXT:    v_mov_b32_e32 v1, -1
+; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: atomic_load_f16_negoffset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s0, 0xfffffe00
+; VI-NEXT:    s_addc_u32 s1, s1, -1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_load_ushort v0, v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_mov_b32 s4, s2
+; VI-NEXT:    s_mov_b32 s5, s3
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: atomic_load_f16_negoffset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+  %gep = getelementptr half, ptr addrspace(1) %in, i64 -256
+  %val = load atomic half, ptr addrspace(1) %gep seq_cst, align 2
+  store half %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; SI-LABEL: atomic_load_bf16_offset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, s2
+; SI-NEXT:    s_mov_b32 s5, s3
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: atomic_load_bf16_offset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s2
+; VI-NEXT:    s_mov_b32 s5, s3
+; VI-NEXT:    s_mov_b32 s2, s6
+; VI-NEXT:    s_mov_b32 s3, s7
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: atomic_load_bf16_offset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+  %gep = getelementptr bfloat, ptr addrspace(1) %in, i64 8
+  %val = load atomic bfloat, ptr addrspace(1) %gep seq_cst, align 2
+  store bfloat %val, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; SI-LABEL: atomic_load_bf16_negoffset:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, s2
+; SI-NEXT:    s_mov_b32 s5, s3
+; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    v_mov_b32_e32 v0, 0xfffffe00
+; SI-NEXT:    v_mov_b32_e32 v1, -1
+; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: atomic_load_bf16_negoffset:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_u32 s0, s0, 0xfffffe00
+; VI-NEXT:    s_addc_u32 s1, s1, -1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_load_ushort v0, v[0:1] glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_mov_b32 s4, s2
+; VI-NEXT:    s_mov_b32 s5, s3
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: atomic_load_bf16_negoffset:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX9-NEXT:    s_endpgm
+  %gep = getelementptr bfloat, ptr addrspace(1) %in, i64 -256
+  %val = load atomic bfloat, ptr addrspace(1) %gep seq_cst, align 2
+  store bfloat %val, ptr addrspace(1) %out
+  ret void
+}
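
The check bodies in flat_atomics.ll and global_atomics.ll are in the
autogenerated, full-function style. Assuming an LLVM build with llc
available (the build/ path below is a stand-in for your build directory,
not part of the commit), the checks can presumably be regenerated and the
tests rerun with something like:

    llvm/utils/update_llc_test_checks.py \
        llvm/test/CodeGen/AMDGPU/flat_atomics.ll \
        llvm/test/CodeGen/AMDGPU/global_atomics.ll
    build/bin/llvm-lit -v llvm/test/CodeGen/AMDGPU/atomic_load_local.ll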
