[PATCH] D106447: [DAGCombine] DAGTypeLegalizer::GenWidenVectorLoads(): make use of dereferenceability knowledge

Jay Foad via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 21 09:25:58 PDT 2021


foad added inline comments.


================
Comment at: llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll:165
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x11
+; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
----------------
lebedev.ri wrote:
> foad wrote:
> > This looks alarming. It's loading 32 bytes instead of 8.
> Could you please be more specific: do you believe this is a correctness concern, or a performance one?
> Because I do believe this is correct:
> ```
> Legalizing node: t67: v10i32,ch = load<(dereferenceable invariant load (s320) from %ir.1, align 4, addrspace 4)> t0, t28, undef:i64
> Analyzing result type: v10i32
> Widen node result 0: t67: v10i32,ch = load<(dereferenceable invariant load (s320) from %ir.1, align 4, addrspace 4)> t0, t28, undef:i64
> 
> NumDereferenceableBytes 40
> Creating new node: t86: v8i32,ch = load<(dereferenceable invariant load (s256) from %ir.1, align 4, addrspace 4)> t0, t28, undef:i64
> Creating constant: t87: i64 = Constant<32>
> Creating new node: t88: i64 = add nuw t28, Constant:i64<32>
> Creating new node: t89: v8i32,ch = load<(dereferenceable invariant load (s256) from %ir.1 + 32, align 4, addrspace 4)> t0, t88, undef:i64
> Creating new node: t90: v16i32 = concat_vectors t86, t89
> Creating new node: t91: ch = TokenFactor t86:1, t89:1
> ```
> ```
> *** IR Dump Before Module Verifier (verify) *** (function: test_copy_v4i8_x4)
> ; ModuleID = '/tmp/test.ll'
> source_filename = "/tmp/test.ll"
> target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
> 
> ; Function Attrs: nounwind
> define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) #0 {
>   %test_copy_v4i8_x4.kernarg.segment = call nonnull align 16 dereferenceable(76) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
>   %out0.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 36
>   %out0.kernarg.offset.cast = bitcast i8 addrspace(4)* %out0.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
>   %1 = bitcast <4 x i8> addrspace(1)* addrspace(4)* %out0.kernarg.offset.cast to <5 x i64> addrspace(4)*, !amdgpu.uniform !0
>   %2 = load <5 x i64>, <5 x i64> addrspace(4)* %1, align 4, !invariant.load !0
>   %out0.load1 = extractelement <5 x i64> %2, i32 0
>   %3 = inttoptr i64 %out0.load1 to <4 x i8> addrspace(1)*
>   %out1.load2 = extractelement <5 x i64> %2, i32 1
>   %4 = inttoptr i64 %out1.load2 to <4 x i8> addrspace(1)*
>   %out2.load3 = extractelement <5 x i64> %2, i32 2
>   %5 = inttoptr i64 %out2.load3 to <4 x i8> addrspace(1)*
>   %out3.load4 = extractelement <5 x i64> %2, i32 3
>   %6 = inttoptr i64 %out3.load4 to <4 x i8> addrspace(1)*
>   %in.load5 = extractelement <5 x i64> %2, i32 4
>   %7 = inttoptr i64 %in.load5 to <4 x i8> addrspace(1)*
>   %out1.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 44
>   %out1.kernarg.offset.cast = bitcast i8 addrspace(4)* %out1.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
>   %out2.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 52
>   %out2.kernarg.offset.cast = bitcast i8 addrspace(4)* %out2.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
>   %out3.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 60
>   %out3.kernarg.offset.cast = bitcast i8 addrspace(4)* %out3.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
>   %in.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 68
>   %in.kernarg.offset.cast = bitcast i8 addrspace(4)* %in.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
>   %tid.x = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
>   %idxprom = sext i32 %tid.x to i64
>   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %7, i64 %idxprom
>   %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
>   store <4 x i8> %val, <4 x i8> addrspace(1)* %3, align 4
>   store <4 x i8> %val, <4 x i8> addrspace(1)* %4, align 4
>   store <4 x i8> %val, <4 x i8> addrspace(1)* %5, align 4
>   store <4 x i8> %val, <4 x i8> addrspace(1)* %6, align 4
>   ret void
> }
> 
> ; Function Attrs: nounwind readnone speculatable willreturn
> declare i32 @llvm.amdgcn.workitem.id.x() #1
> 
> ; Function Attrs: nounwind readnone speculatable willreturn
> declare i32 @llvm.amdgcn.workitem.id.y() #1
> 
> ; Function Attrs: nounwind readnone speculatable willreturn
> declare align 4 i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2
> 
> ; Function Attrs: convergent nounwind willreturn
> declare { i1, i64 } @llvm.amdgcn.if.i64(i1) #3
> 
> ; Function Attrs: convergent nounwind willreturn
> declare { i1, i64 } @llvm.amdgcn.else.i64.i64(i64) #3
> 
> ; Function Attrs: convergent nounwind readnone willreturn
> declare i64 @llvm.amdgcn.if.break.i64(i1, i64) #4
> 
> ; Function Attrs: convergent nounwind willreturn
> declare i1 @llvm.amdgcn.loop.i64(i64) #3
> 
> ; Function Attrs: convergent nounwind willreturn
> declare void @llvm.amdgcn.end.cf.i64(i64) #3
> 
> attributes #0 = { nounwind "amdgpu-memory-bound"="true" "amdgpu-wave-limiter"="true" "target-cpu"="tahiti" }
> attributes #1 = { nounwind readnone speculatable willreturn "target-cpu"="tahiti" }
> attributes #2 = { nounwind readnone speculatable willreturn }
> attributes #3 = { convergent nounwind willreturn }
> attributes #4 = { convergent nounwind readnone willreturn }
> 
> !0 = !{}
> !1 = !{i32 0, i32 1024}
> 
> ```
I meant it looks like an alarming performance regression. I have no reason to believe it's incorrect.
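To spell out how the numbers in the quoted trace relate to the IR: the kernarg segment pointer is `dereferenceable(76)` and the widened load starts at byte offset 36 (the %out0.kernarg.offset GEP), which is where `NumDereferenceableBytes 40` comes from; the s320 (v10i32) load is then widened into two s256 (v8i32) pieces (t86 and t89 at offsets 0 and 32), which is why the diff shows s_load_dwordx8 where it used to show s_load_dwordx2. A minimal sketch of that arithmetic, using hypothetical constant names rather than anything from GenWidenVectorLoads itself:

```
#include <cstdint>

// Hypothetical constants taken from the IR and trace quoted above;
// none of these names appear in GenWidenVectorLoads.
constexpr uint64_t KernargSegmentDerefBytes = 76; // dereferenceable(76) on the segment pointer
constexpr uint64_t WidenedLoadOffsetBytes   = 36; // %out0.kernarg.offset is a GEP of 36
constexpr uint64_t OrigLoadBytes            = 40; // v10i32 == s320
constexpr uint64_t ChunkBytes               = 32; // each v8i32 piece == s256 == one s_load_dwordx8

// Matches "NumDereferenceableBytes 40" in the trace.
static_assert(KernargSegmentDerefBytes - WidenedLoadOffsetBytes == 40, "");

// The s320 load is covered by two s256 chunks (t86 at offset 0, t89 at offset 32).
static_assert(2 * ChunkBytes >= OrigLoadBytes, "");
```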


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106447/new/

https://reviews.llvm.org/D106447


