[llvm] [AMDGPU] Sink uniform buffer address offsets into soffset (PR #169230)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 23 16:48:14 PST 2025
================
@@ -0,0 +1,461 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=CHECK %s
+
+; Test comprehensive patterns for ADD(divergent, uniform) optimization in buffer stores
+
+; Basic workitem.id.x + uniform for store
+define amdgpu_kernel void @test_basic_workitem_uniform_store(ptr addrspace(1) %input, i32 %soffset) {
+; CHECK-LABEL: test_basic_workitem_uniform_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v1, v1, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %sum = add i32 %voffset, %soffset
+ %val = load i32, ptr addrspace(1) %input
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ ret void
+}
+
+; Reversed operands (uniform + divergent) for store
+define amdgpu_kernel void @test_reversed_operands_store(ptr addrspace(1) %input, i32 %soffset) {
+; CHECK-LABEL: test_reversed_operands_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v1, v1, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %sum = add i32 %soffset, %voffset ; Reversed: uniform + divergent
+ %val = load i32, ptr addrspace(1) %input
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ ret void
+}
+
+; Multiple buffer stores with same pattern
+define amdgpu_kernel void @test_multiple_stores(ptr addrspace(1) %input, i32 %soffset1, i32 %soffset2) {
+; CHECK-LABEL: test_multiple_stores:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v1, v1, s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[4:7], s2 offen
+; CHECK-NEXT: v_add_u32_e32 v1, 10, v1
+; CHECK-NEXT: buffer_store_dword v1, v0, s[4:7], s3 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %val = load i32, ptr addrspace(1) %input
+
+ %sum1 = add i32 %voffset, %soffset1
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum1, i32 0, i32 0)
+
+ %sum2 = add i32 %voffset, %soffset2
+ %val2 = add i32 %val, 10
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val2, <4 x i32> %desc, i32 %sum2, i32 0, i32 0)
+
+ ret void
+}
+
+; Different buffer store variants - byte store
+define amdgpu_kernel void @test_buffer_store_byte(ptr addrspace(1) %input, i32 %soffset) {
+; CHECK-LABEL: test_buffer_store_byte:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v1, v1, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], s4 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %sum = add i32 %voffset, %soffset
+ %val = load i32, ptr addrspace(1) %input
+ %trunc = trunc i32 %val to i8
+ call void @llvm.amdgcn.raw.buffer.store.i8(i8 %trunc, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ ret void
+}
+
+; Different buffer store variants - short store
+define amdgpu_kernel void @test_buffer_store_short(ptr addrspace(1) %input, i32 %soffset) {
+; CHECK-LABEL: test_buffer_store_short:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v1, v1, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], s4 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %sum = add i32 %voffset, %soffset
+ %val = load i32, ptr addrspace(1) %input
+ %trunc = trunc i32 %val to i16
+ call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ ret void
+}
+
+; Vector stores - v2i32
+define amdgpu_kernel void @test_buffer_store_v2i32(ptr addrspace(1) %input, i32 %soffset) {
+; CHECK-LABEL: test_buffer_store_v2i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dwordx2 v[1:2], v0, s[0:3], s4 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %sum = add i32 %voffset, %soffset
+ %val = load <2 x i32>, ptr addrspace(1) %input
+ call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ ret void
+}
+
+; Vector stores - v4i32
+define amdgpu_kernel void @test_buffer_store_v4i32(ptr addrspace(1) %input, i32 %soffset) {
+; CHECK-LABEL: test_buffer_store_v4i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v1, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], s4 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %sum = add i32 %voffset, %soffset
+ %val = load <4 x i32>, ptr addrspace(1) %input
+ call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ ret void
+}
+
+; Float stores
+define amdgpu_kernel void @test_buffer_store_float(ptr addrspace(1) %input, i32 %soffset) {
+; CHECK-LABEL: test_buffer_store_float:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v1, v1, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %sum = add i32 %voffset, %soffset
+ %val = load float, ptr addrspace(1) %input
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ ret void
+}
+
+; Complex divergent expression + uniform for store
+define amdgpu_kernel void @test_complex_divergent_store(ptr addrspace(1) %input, i32 %soffset) {
+; CHECK-LABEL: test_complex_divergent_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c
+; CHECK-NEXT: v_add_u32_e32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v2, v2, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], s4 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %tid_x = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid_y = call i32 @llvm.amdgcn.workitem.id.y()
+ %divergent = add i32 %tid_x, %tid_y ; Still divergent
+ %sum = add i32 %divergent, %soffset ; divergent + uniform
+ %val = load i32, ptr addrspace(1) %input
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ ret void
+}
+
+; Should NOT optimize - both operands divergent
+define amdgpu_kernel void @test_both_divergent_store(ptr addrspace(1) %input) {
+; CHECK-LABEL: test_both_divergent_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_add_u32_e32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v2, v2, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %tid_x = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid_y = call i32 @llvm.amdgcn.workitem.id.y()
+ %sum = add i32 %tid_x, %tid_y
+ %val = load i32, ptr addrspace(1) %input
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ ret void
+}
+
+; Should NOT optimize - both operands uniform
+define amdgpu_kernel void @test_both_uniform_store(ptr addrspace(1) %input, i32 %soffset1, i32 %soffset2) {
+; CHECK-LABEL: test_both_uniform_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v0, v0, s[0:1]
+; CHECK-NEXT: s_add_i32 s0, s2, s3
+; CHECK-NEXT: v_mov_b32_e32 v1, s0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %sum = add i32 %soffset1, %soffset2
+ %val = load i32, ptr addrspace(1) %input
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ ret void
+}
+
+; Nested in control flow
+define amdgpu_kernel void @test_control_flow_store(ptr addrspace(1) %input, i32 %soffset, i32 %condition) {
+; CHECK-LABEL: test_control_flow_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v1, v1, s[0:1]
+; CHECK-NEXT: s_cmp_lg_u32 s3, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB11_4
+; CHECK-NEXT: ; %bb.1: ; %else
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
+; CHECK-NEXT: s_cbranch_execnz .LBB11_3
+; CHECK-NEXT: .LBB11_2: ; %then
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[4:7], s2 offen
+; CHECK-NEXT: .LBB11_3: ; %end
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: .LBB11_4:
+; CHECK-NEXT: s_branch .LBB11_2
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %val = load i32, ptr addrspace(1) %input
+ %cmp = icmp eq i32 %condition, 0
+ br i1 %cmp, label %then, label %else
+
+then:
+ %sum = add i32 %voffset, %soffset
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ br label %end
+
+else:
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %voffset, i32 0, i32 0)
+ br label %end
+
+end:
+ ret void
+}
+
+; Multiple uses of the ADD result - should still optimize buffer store
+define amdgpu_kernel void @test_multiple_uses_store(ptr addrspace(1) %input, ptr addrspace(1) %output, i32 %soffset) {
+; CHECK-LABEL: test_multiple_uses_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; CHECK-NEXT: s_load_dword s8, s[4:5], 0x34
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v2, v1, s[0:1]
+; CHECK-NEXT: v_add_u32_e32 v3, s8, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v2, v0, s[4:7], s8 offen
+; CHECK-NEXT: global_store_dword v1, v3, s[2:3]
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %sum = add i32 %voffset, %soffset
+ %val = load i32, ptr addrspace(1) %input
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ store i32 %sum, ptr addrspace(1) %output
+ ret void
+}
+
+; Chain of operations - workitem.id -> mul -> add -> buffer_store
+define amdgpu_kernel void @test_operation_chain_store(ptr addrspace(1) %input, i32 %soffset) {
+; CHECK-LABEL: test_operation_chain_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c
+; CHECK-NEXT: v_mul_u32_u24_e32 v0, 4, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v1, v1, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %scaled = mul i32 %tid, 4 ; Still divergent
+ %sum = add i32 %scaled, %soffset ; divergent + uniform
+ %val = load i32, ptr addrspace(1) %input
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ ret void
+}
+
+; Should NOT optimize - Buffer store with non-zero soffset field already
+define amdgpu_kernel void @test_existing_soffset_store(ptr addrspace(1) %input, i32 %soffset) {
+; CHECK-LABEL: test_existing_soffset_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v1, v1, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_add_u32_e32 v0, s4, v0
+; CHECK-NEXT: s_movk_i32 s4, 0x64
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %sum = add i32 %voffset, %soffset
+ %val = load i32, ptr addrspace(1) %input
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 100, i32 0) ; Non-zero soffset
+ ret void
+}
+
+; Should NOT optimize - Structured buffer stores
+define amdgpu_kernel void @test_struct_buffer_store(ptr addrspace(1) %input, i32 %soffset) {
+; CHECK-LABEL: test_struct_buffer_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v1, v1, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_add_u32_e32 v0, s4, v0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %sum = add i32 %voffset, %soffset
+ %val = load i32, ptr addrspace(1) %input
+ call void @llvm.amdgcn.struct.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0, i32 0)
+ ret void
+}
+
+; Should NOT optimize - small positive constant fits in immediate offset field
+define amdgpu_kernel void @test_small_positive_constant_store(ptr addrspace(1) %input) {
+; CHECK-LABEL: test_small_positive_constant_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v1, v1, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %sum = add i32 %voffset, 100
+ %val = load i32, ptr addrspace(1) %input
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ ret void
+}
+
+; Should optimize - negative constant must use soffset
+define amdgpu_kernel void @test_negative_constant_store(ptr addrspace(1) %input) {
+; CHECK-LABEL: test_negative_constant_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v1, v1, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], -16 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %sum = add i32 %voffset, -16
+ %val = load i32, ptr addrspace(1) %input
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ ret void
+}
+
+; Should optimize - large constant doesn't fit in immediate offset field
+define amdgpu_kernel void @test_large_constant_store(ptr addrspace(1) %input) {
+; CHECK-LABEL: test_large_constant_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_movk_i32 s4, 0x1388
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v1, v1, s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen
+; CHECK-NEXT: s_endpgm
+ %desc = call <4 x i32> asm "", "=s"()
+ %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+ %sum = add i32 %voffset, 5000
+ %val = load i32, ptr addrspace(1) %input
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
+ ret void
+}
----------------
shiltian wrote:
missing a trailing newline at EOF
https://github.com/llvm/llvm-project/pull/169230
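
For context (an editorial sketch distilled from the test cases above, not part of the patch): the pattern these tests exercise is a raw buffer store whose voffset is add(divergent, uniform) and whose intrinsic soffset is 0; per the CHECK lines, the uniform addend is sunk into the MUBUF soffset operand instead of being materialized with a v_add. A minimal standalone reproducer in the same shape as test_basic_workitem_uniform_store (the function name @sink_uniform_offset is illustrative):

declare i32 @llvm.amdgcn.workitem.id.x()
declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg)

define amdgpu_kernel void @sink_uniform_offset(<4 x i32> %desc, i32 %soffset, i32 %val) {
  ; Kernel arguments are uniform; the workitem id is divergent.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %sum = add i32 %tid, %soffset        ; divergent + uniform offset
  ; voffset = %sum, intrinsic soffset = 0: candidate for sinking %soffset.
  call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0)
  ; Expected selection (per the CHECK lines above):
  ;   buffer_store_dword v..., v..., s[...:...], s... offen
  ret void
}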