[llvm-branch-commits] [llvm] [AMDGPU][SILoadStoreOptimizer] Merge constrained sloads (PR #96162)
Christudasan Devadasan via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jul 3 09:15:14 PDT 2024
================
@@ -183,10 +183,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
----------------
cdevadas wrote:
Unfortunately, that's not happening. The IR load-store-vectorizer doesn't combine the two loads.
I still see the two loads after the IR vectorizer, and they become two loads in the selected code. Could this be because the alignments of the two loads differ, so the IR vectorizer safely ignores them?
*** IR Dump before Selection ***
define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) #0 {
%local_atomic_fadd_v2bf16_noret.kernarg.segment = call nonnull align 16 dereferenceable(44) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
%ptr.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %local_atomic_fadd_v2bf16_noret.kernarg.segment, i64 36, !amdgpu.uniform !0
**%ptr.load = load ptr addrspace(3), ptr addrspace(4) %ptr.kernarg.offset**, align 4, !invariant.load !0
%data.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %local_atomic_fadd_v2bf16_noret.kernarg.segment, i64 40, !amdgpu.uniform !0
**%data.load = load <2 x i16>, ptr addrspace(4) %data.kernarg.offset**, align 8, !invariant.load !0
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr.load, <2 x i16> %data.load)
ret void
}
# *** IR Dump After selection ***:
# Machine code for function local_atomic_fadd_v2bf16_noret: IsSSA, TracksLiveness
Function Live Ins: $sgpr0_sgpr1 in %1
bb.0 (%ir-block.0):
liveins: $sgpr0_sgpr1
%1:sgpr_64(p4) = COPY $sgpr0_sgpr1
%3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1:sgpr_64(p4), 36, 0 :: (dereferenceable invariant load (s32) from %ir.ptr.kernarg.offset, addrspace 4)
%4:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1:sgpr_64(p4), 40, 0 :: (dereferenceable invariant load (s32) from %ir.data.kernarg.offset, align 8, addrspace 4)
%5:vgpr_32 = COPY %3:sreg_32_xm0_xexec
%6:vgpr_32 = COPY %4:sreg_32_xm0_xexec
DS_PK_ADD_BF16 killed %5:vgpr_32, killed %6:vgpr_32, 0, 0, implicit $m0, implicit $exec
S_ENDPGM 0
https://github.com/llvm/llvm-project/pull/96162
More information about the llvm-branch-commits
mailing list