[PATCH] D106447: [DAGCombine] DAGTypeLegalizer::GenWidenVectorLoads(): make use of dereferenceability knowledge

Simon Pilgrim via Phabricator via llvm-commits <llvm-commits at lists.llvm.org>
Wed Jul 21 11:38:34 PDT 2021


RKSimon added inline comments.


================
Comment at: llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll:165
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x11
+; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
----------------
lebedev.ri wrote:
> foad wrote:
> > lebedev.ri wrote:
> > > foad wrote:
> > > > This looks alarming. It's loading 32 bytes instead of 8.
> > > Could you please be more specific: do you believe this is a correctness concern, or a performance one?
> > > Because I do believe this is correct:
> > > ```
> > > Legalizing node: t67: v10i32,ch = load<(dereferenceable invariant load (s320) from %ir.1, align 4, addrspace 4)> t0, t28, undef:i64
> > > Analyzing result type: v10i32
> > > Widen node result 0: t67: v10i32,ch = load<(dereferenceable invariant load (s320) from %ir.1, align 4, addrspace 4)> t0, t28, undef:i64
> > > 
> > > NumDereferenceableBytes 40
> > > Creating new node: t86: v8i32,ch = load<(dereferenceable invariant load (s256) from %ir.1, align 4, addrspace 4)> t0, t28, undef:i64
> > > Creating constant: t87: i64 = Constant<32>
> > > Creating new node: t88: i64 = add nuw t28, Constant:i64<32>
> > > Creating new node: t89: v8i32,ch = load<(dereferenceable invariant load (s256) from %ir.1 + 32, align 4, addrspace 4)> t0, t88, undef:i64
> > > Creating new node: t90: v16i32 = concat_vectors t86, t89
> > > Creating new node: t91: ch = TokenFactor t86:1, t89:1
> > > ```
> > > ```
> > > *** IR Dump Before Module Verifier (verify) *** (function: test_copy_v4i8_x4)
> > > ; ModuleID = '/tmp/test.ll'
> > > source_filename = "/tmp/test.ll"
> > > target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
> > > 
> > > ; Function Attrs: nounwind
> > > define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) #0 {
> > >   %test_copy_v4i8_x4.kernarg.segment = call nonnull align 16 dereferenceable(76) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
> > >   %out0.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 36
> > >   %out0.kernarg.offset.cast = bitcast i8 addrspace(4)* %out0.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
> > >   %1 = bitcast <4 x i8> addrspace(1)* addrspace(4)* %out0.kernarg.offset.cast to <5 x i64> addrspace(4)*, !amdgpu.uniform !0
> > >   %2 = load <5 x i64>, <5 x i64> addrspace(4)* %1, align 4, !invariant.load !0
> > >   %out0.load1 = extractelement <5 x i64> %2, i32 0
> > >   %3 = inttoptr i64 %out0.load1 to <4 x i8> addrspace(1)*
> > >   %out1.load2 = extractelement <5 x i64> %2, i32 1
> > >   %4 = inttoptr i64 %out1.load2 to <4 x i8> addrspace(1)*
> > >   %out2.load3 = extractelement <5 x i64> %2, i32 2
> > >   %5 = inttoptr i64 %out2.load3 to <4 x i8> addrspace(1)*
> > >   %out3.load4 = extractelement <5 x i64> %2, i32 3
> > >   %6 = inttoptr i64 %out3.load4 to <4 x i8> addrspace(1)*
> > >   %in.load5 = extractelement <5 x i64> %2, i32 4
> > >   %7 = inttoptr i64 %in.load5 to <4 x i8> addrspace(1)*
> > >   %out1.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 44
> > >   %out1.kernarg.offset.cast = bitcast i8 addrspace(4)* %out1.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
> > >   %out2.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 52
> > >   %out2.kernarg.offset.cast = bitcast i8 addrspace(4)* %out2.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
> > >   %out3.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 60
> > >   %out3.kernarg.offset.cast = bitcast i8 addrspace(4)* %out3.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
> > >   %in.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 68
> > >   %in.kernarg.offset.cast = bitcast i8 addrspace(4)* %in.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
> > >   %tid.x = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
> > >   %idxprom = sext i32 %tid.x to i64
> > >   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %7, i64 %idxprom
> > >   %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
> > >   store <4 x i8> %val, <4 x i8> addrspace(1)* %3, align 4
> > >   store <4 x i8> %val, <4 x i8> addrspace(1)* %4, align 4
> > >   store <4 x i8> %val, <4 x i8> addrspace(1)* %5, align 4
> > >   store <4 x i8> %val, <4 x i8> addrspace(1)* %6, align 4
> > >   ret void
> > > }
> > > 
> > > ; Function Attrs: nounwind readnone speculatable willreturn
> > > declare i32 @llvm.amdgcn.workitem.id.x() #1
> > > 
> > > ; Function Attrs: nounwind readnone speculatable willreturn
> > > declare i32 @llvm.amdgcn.workitem.id.y() #1
> > > 
> > > ; Function Attrs: nounwind readnone speculatable willreturn
> > > declare align 4 i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2
> > > 
> > > ; Function Attrs: convergent nounwind willreturn
> > > declare { i1, i64 } @llvm.amdgcn.if.i64(i1) #3
> > > 
> > > ; Function Attrs: convergent nounwind willreturn
> > > declare { i1, i64 } @llvm.amdgcn.else.i64.i64(i64) #3
> > > 
> > > ; Function Attrs: convergent nounwind readnone willreturn
> > > declare i64 @llvm.amdgcn.if.break.i64(i1, i64) #4
> > > 
> > > ; Function Attrs: convergent nounwind willreturn
> > > declare i1 @llvm.amdgcn.loop.i64(i64) #3
> > > 
> > > ; Function Attrs: convergent nounwind willreturn
> > > declare void @llvm.amdgcn.end.cf.i64(i64) #3
> > > 
> > > attributes #0 = { nounwind "amdgpu-memory-bound"="true" "amdgpu-wave-limiter"="true" "target-cpu"="tahiti" }
> > > attributes #1 = { nounwind readnone speculatable willreturn "target-cpu"="tahiti" }
> > > attributes #2 = { nounwind readnone speculatable willreturn }
> > > attributes #3 = { convergent nounwind willreturn }
> > > attributes #4 = { convergent nounwind readnone willreturn }
> > > 
> > > !0 = !{}
> > > !1 = !{i32 0, i32 1024}
> > > 
> > > ```
> > I meant it looks like an alarming performance regression. I have no reason to believe it's incorrect.
> I'm open to suggestions.
The problem you're going to hit is that FindMemType searches for the biggest possible valid type, and the dereferenceability change has relaxed that validity check even further. You're probably going to have to get FindMemType to continue searching the smaller types until it fails a second validity test?
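
For illustration, a minimal sketch of that shape -- this is not the actual FindMemType from LegalizeVectorTypes.cpp; the `Candidate` struct, the `WidestFirst` list, and the exact second test are hypothetical stand-ins (the real code iterates over MVT vector types and asks the target about legality), and "don't read past the dereferenceable bytes" is just one plausible choice of second test:

```
#include <cstdint>
#include <vector>

// Hypothetical stand-in for one candidate memory type, listed widest first.
struct Candidate {
  unsigned SizeInBytes; // bytes a load of this type would read
  bool Legal;           // result of the existing validity check
};

// Instead of returning the first (widest) legal candidate, keep searching
// the smaller types while a second validity test fails -- here, "the load
// must not read past the bytes known dereferenceable" (40 in the trace
// above). Returns -1 if no candidate is legal at all.
int findMemType(const std::vector<Candidate> &WidestFirst,
                uint64_t NumDereferenceableBytes) {
  int WidestLegal = -1;
  for (int I = 0, E = (int)WidestFirst.size(); I != E; ++I) {
    if (!WidestFirst[I].Legal)
      continue; // first validity test, as today
    if (WidestLegal < 0)
      WidestLegal = I; // remember the current behaviour's answer
    if (WidestFirst[I].SizeInBytes <= NumDereferenceableBytes)
      return I; // second validity test passed: stop here
  }
  return WidestLegal; // fall back to the widest legal type
}
```

In the trace above, that second test would reject a split into two 32-byte loads (the second one reads bytes 32..63 against only 40 dereferenceable bytes) and keep stepping down to narrower types.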


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106447/new/

https://reviews.llvm.org/D106447

