[PATCH] D106447: [DAGCombine] DAGTypeLegalizer::GenWidenVectorLoads(): make use of dereferenceability knowledge
Jay Foad via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 21 09:25:58 PDT 2021
foad added inline comments.
================
Comment at: llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll:165
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11
+; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
----------------
lebedev.ri wrote:
> foad wrote:
> > This looks alarming. It's loading 32 bytes instead of 8.
> Could you please be more specific: do you believe this is a correctness concern, or a performance one?
> Because I do believe this is correct:
> ```
> Legalizing node: t67: v10i32,ch = load<(dereferenceable invariant load (s320) from %ir.1, align 4, addrspace 4)> t0, t28, undef:i64
> Analyzing result type: v10i32
> Widen node result 0: t67: v10i32,ch = load<(dereferenceable invariant load (s320) from %ir.1, align 4, addrspace 4)> t0, t28, undef:i64
>
> NumDereferenceableBytes 40
> Creating new node: t86: v8i32,ch = load<(dereferenceable invariant load (s256) from %ir.1, align 4, addrspace 4)> t0, t28, undef:i64
> Creating constant: t87: i64 = Constant<32>
> Creating new node: t88: i64 = add nuw t28, Constant:i64<32>
> Creating new node: t89: v8i32,ch = load<(dereferenceable invariant load (s256) from %ir.1 + 32, align 4, addrspace 4)> t0, t88, undef:i64
> Creating new node: t90: v16i32 = concat_vectors t86, t89
> Creating new node: t91: ch = TokenFactor t86:1, t89:1
> ```
> ```
> *** IR Dump Before Module Verifier (verify) *** (function: test_copy_v4i8_x4)
> ; ModuleID = '/tmp/test.ll'
> source_filename = "/tmp/test.ll"
> target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
>
> ; Function Attrs: nounwind
> define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) #0 {
> %test_copy_v4i8_x4.kernarg.segment = call nonnull align 16 dereferenceable(76) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
> %out0.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 36
> %out0.kernarg.offset.cast = bitcast i8 addrspace(4)* %out0.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
> %1 = bitcast <4 x i8> addrspace(1)* addrspace(4)* %out0.kernarg.offset.cast to <5 x i64> addrspace(4)*, !amdgpu.uniform !0
> %2 = load <5 x i64>, <5 x i64> addrspace(4)* %1, align 4, !invariant.load !0
> %out0.load1 = extractelement <5 x i64> %2, i32 0
> %3 = inttoptr i64 %out0.load1 to <4 x i8> addrspace(1)*
> %out1.load2 = extractelement <5 x i64> %2, i32 1
> %4 = inttoptr i64 %out1.load2 to <4 x i8> addrspace(1)*
> %out2.load3 = extractelement <5 x i64> %2, i32 2
> %5 = inttoptr i64 %out2.load3 to <4 x i8> addrspace(1)*
> %out3.load4 = extractelement <5 x i64> %2, i32 3
> %6 = inttoptr i64 %out3.load4 to <4 x i8> addrspace(1)*
> %in.load5 = extractelement <5 x i64> %2, i32 4
> %7 = inttoptr i64 %in.load5 to <4 x i8> addrspace(1)*
> %out1.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 44
> %out1.kernarg.offset.cast = bitcast i8 addrspace(4)* %out1.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
> %out2.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 52
> %out2.kernarg.offset.cast = bitcast i8 addrspace(4)* %out2.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
> %out3.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 60
> %out3.kernarg.offset.cast = bitcast i8 addrspace(4)* %out3.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
> %in.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 68
> %in.kernarg.offset.cast = bitcast i8 addrspace(4)* %in.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
> %tid.x = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
> %idxprom = sext i32 %tid.x to i64
> %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %7, i64 %idxprom
> %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
> store <4 x i8> %val, <4 x i8> addrspace(1)* %3, align 4
> store <4 x i8> %val, <4 x i8> addrspace(1)* %4, align 4
> store <4 x i8> %val, <4 x i8> addrspace(1)* %5, align 4
> store <4 x i8> %val, <4 x i8> addrspace(1)* %6, align 4
> ret void
> }
>
> ; Function Attrs: nounwind readnone speculatable willreturn
> declare i32 @llvm.amdgcn.workitem.id.x() #1
>
> ; Function Attrs: nounwind readnone speculatable willreturn
> declare i32 @llvm.amdgcn.workitem.id.y() #1
>
> ; Function Attrs: nounwind readnone speculatable willreturn
> declare align 4 i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2
>
> ; Function Attrs: convergent nounwind willreturn
> declare { i1, i64 } @llvm.amdgcn.if.i64(i1) #3
>
> ; Function Attrs: convergent nounwind willreturn
> declare { i1, i64 } @llvm.amdgcn.else.i64.i64(i64) #3
>
> ; Function Attrs: convergent nounwind readnone willreturn
> declare i64 @llvm.amdgcn.if.break.i64(i1, i64) #4
>
> ; Function Attrs: convergent nounwind willreturn
> declare i1 @llvm.amdgcn.loop.i64(i64) #3
>
> ; Function Attrs: convergent nounwind willreturn
> declare void @llvm.amdgcn.end.cf.i64(i64) #3
>
> attributes #0 = { nounwind "amdgpu-memory-bound"="true" "amdgpu-wave-limiter"="true" "target-cpu"="tahiti" }
> attributes #1 = { nounwind readnone speculatable willreturn "target-cpu"="tahiti" }
> attributes #2 = { nounwind readnone speculatable willreturn }
> attributes #3 = { convergent nounwind willreturn }
> attributes #4 = { convergent nounwind readnone willreturn }
>
> !0 = !{}
> !1 = !{i32 0, i32 1024}
>
> ```
I meant it looks like an alarming performance regression. I have no reason to believe it's incorrect.
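
For readers following the trace above: the `NumDereferenceableBytes 40` appears to come from the `dereferenceable(76)` kernarg segment minus the 36-byte offset of the first pointer argument in the IR dump. As a minimal sketch of the general technique under discussion (not the actual DAGTypeLegalizer::GenWidenVectorLoads code; the helper name and planning loop are purely illustrative), one can think of widening as splitting an N-byte load into power-of-two chunks while never letting a chunk read past the bytes known to be dereferenceable from the base pointer:

```
// Minimal standalone sketch, assuming a simplified widening model.
// NOT the DAGTypeLegalizer implementation; names are hypothetical.
#include <cstdint>
#include <iostream>
#include <vector>

struct PlannedLoad {
  uint64_t Offset; // byte offset from the base pointer
  uint64_t Size;   // bytes read by this chunk
};

std::vector<PlannedLoad> planWidenedLoads(uint64_t LoadSize,
                                          uint64_t DereferenceableBytes,
                                          uint64_t MaxChunk) {
  std::vector<PlannedLoad> Plan;
  for (uint64_t Offset = 0; Offset < LoadSize;) {
    uint64_t Chunk = MaxChunk;
    // Halve the chunk until it stays within the known-dereferenceable region.
    while (Chunk > 1 && Offset + Chunk > DereferenceableBytes)
      Chunk /= 2;
    Plan.push_back({Offset, Chunk});
    Offset += Chunk;
  }
  return Plan;
}

int main() {
  // A 40-byte load with 40 dereferenceable bytes and a 32-byte widest chunk,
  // roughly the shape of the s320 load in the trace above.
  for (const PlannedLoad &L : planWidenedLoads(40, 40, 32))
    std::cout << "load " << L.Size << " bytes at offset " << L.Offset << '\n';
  return 0;
}
```

Note that the legalizer in the trace is more aggressive than this conservative sketch: it emits two 32-byte loads and concatenates them into a v16i32. The sketch only illustrates why knowing how many bytes are dereferenceable is what justifies emitting loads wider than the original access.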
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D106447/new/
https://reviews.llvm.org/D106447