[PATCH] D106447: [DAGCombine] DAGTypeLegalizer::GenWidenVectorLoads(): make use of dereferenceability knowledge

Roman Lebedev via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 21 09:12:43 PDT 2021


lebedev.ri added inline comments.


================
Comment at: llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll:165
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x11
+; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
----------------
foad wrote:
> This looks alarming. It's loading 32 bytes instead of 8.
Could you please be more specific: do you believe this is a correctness concern, or a performance one?
I ask because I do believe this is correct:
```
Legalizing node: t67: v10i32,ch = load<(dereferenceable invariant load (s320) from %ir.1, align 4, addrspace 4)> t0, t28, undef:i64
Analyzing result type: v10i32
Widen node result 0: t67: v10i32,ch = load<(dereferenceable invariant load (s320) from %ir.1, align 4, addrspace 4)> t0, t28, undef:i64

NumDereferenceableBytes 40
Creating new node: t86: v8i32,ch = load<(dereferenceable invariant load (s256) from %ir.1, align 4, addrspace 4)> t0, t28, undef:i64
Creating constant: t87: i64 = Constant<32>
Creating new node: t88: i64 = add nuw t28, Constant:i64<32>
Creating new node: t89: v8i32,ch = load<(dereferenceable invariant load (s256) from %ir.1 + 32, align 4, addrspace 4)> t0, t88, undef:i64
Creating new node: t90: v16i32 = concat_vectors t86, t89
Creating new node: t91: ch = TokenFactor t86:1, t89:1
```
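
To spell out where `NumDereferenceableBytes 40` comes from (see the IR dump below): the kernarg segment pointer is `dereferenceable(76)` and the widened load starts at byte offset 36 into it, so 76 - 36 = 40 bytes are known dereferenceable, which is exactly the size of the original `<5 x i64>` load (`v10i32`, s320, in the DAG). A minimal standalone sketch of that arithmetic (plain C++, illustrative names only, not code from the patch):
```
#include <cstdio>

int main() {
  // From the IR dump below: @llvm.amdgcn.kernarg.segment.ptr() is
  // dereferenceable(76), and the load's GEP offset within it is 36.
  unsigned KernargSegmentBytes = 76;
  unsigned LoadStartOffset = 36;
  unsigned NumDereferenceableBytes = KernargSegmentBytes - LoadStartOffset;

  // The load itself is <5 x i64> in IR, i.e. v10i32 (s320) in the DAG:
  unsigned OrigLoadBytes = 10 * 4; // 40 bytes

  // Prints 40 and 40, matching "NumDereferenceableBytes 40" in the trace.
  std::printf("%u dereferenceable, %u loaded\n",
              NumDereferenceableBytes, OrigLoadBytes);
  return 0;
}
```
And here is the IR right before the module verifier, for reference: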
```
*** IR Dump Before Module Verifier (verify) *** (function: test_copy_v4i8_x4)
; ModuleID = '/tmp/test.ll'
source_filename = "/tmp/test.ll"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"

; Function Attrs: nounwind
define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) #0 {
  %test_copy_v4i8_x4.kernarg.segment = call nonnull align 16 dereferenceable(76) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %out0.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 36
  %out0.kernarg.offset.cast = bitcast i8 addrspace(4)* %out0.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
  %1 = bitcast <4 x i8> addrspace(1)* addrspace(4)* %out0.kernarg.offset.cast to <5 x i64> addrspace(4)*, !amdgpu.uniform !0
  %2 = load <5 x i64>, <5 x i64> addrspace(4)* %1, align 4, !invariant.load !0
  %out0.load1 = extractelement <5 x i64> %2, i32 0
  %3 = inttoptr i64 %out0.load1 to <4 x i8> addrspace(1)*
  %out1.load2 = extractelement <5 x i64> %2, i32 1
  %4 = inttoptr i64 %out1.load2 to <4 x i8> addrspace(1)*
  %out2.load3 = extractelement <5 x i64> %2, i32 2
  %5 = inttoptr i64 %out2.load3 to <4 x i8> addrspace(1)*
  %out3.load4 = extractelement <5 x i64> %2, i32 3
  %6 = inttoptr i64 %out3.load4 to <4 x i8> addrspace(1)*
  %in.load5 = extractelement <5 x i64> %2, i32 4
  %7 = inttoptr i64 %in.load5 to <4 x i8> addrspace(1)*
  %out1.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 44
  %out1.kernarg.offset.cast = bitcast i8 addrspace(4)* %out1.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
  %out2.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 52
  %out2.kernarg.offset.cast = bitcast i8 addrspace(4)* %out2.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
  %out3.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 60
  %out3.kernarg.offset.cast = bitcast i8 addrspace(4)* %out3.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
  %in.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %test_copy_v4i8_x4.kernarg.segment, i64 68
  %in.kernarg.offset.cast = bitcast i8 addrspace(4)* %in.kernarg.offset to <4 x i8> addrspace(1)* addrspace(4)*
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
  %idxprom = sext i32 %tid.x to i64
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %7, i64 %idxprom
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %3, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %4, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %5, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %6, align 4
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workitem.id.x() #1

; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workitem.id.y() #1

; Function Attrs: nounwind readnone speculatable willreturn
declare align 4 i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2

; Function Attrs: convergent nounwind willreturn
declare { i1, i64 } @llvm.amdgcn.if.i64(i1) #3

; Function Attrs: convergent nounwind willreturn
declare { i1, i64 } @llvm.amdgcn.else.i64.i64(i64) #3

; Function Attrs: convergent nounwind readnone willreturn
declare i64 @llvm.amdgcn.if.break.i64(i1, i64) #4

; Function Attrs: convergent nounwind willreturn
declare i1 @llvm.amdgcn.loop.i64(i64) #3

; Function Attrs: convergent nounwind willreturn
declare void @llvm.amdgcn.end.cf.i64(i64) #3

attributes #0 = { nounwind "amdgpu-memory-bound"="true" "amdgpu-wave-limiter"="true" "target-cpu"="tahiti" }
attributes #1 = { nounwind readnone speculatable willreturn "target-cpu"="tahiti" }
attributes #2 = { nounwind readnone speculatable willreturn }
attributes #3 = { convergent nounwind willreturn }
attributes #4 = { convergent nounwind readnone willreturn }

!0 = !{}
!1 = !{i32 0, i32 1024}

```
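
For completeness, the chunking in the trace is: the widened result type is `v16i32` (64 bytes), assembled from two legal `v8i32` (s256, 32-byte) loads at byte offsets 0 and 32 (t86 and t89), joined by a `concat_vectors` for the value and a `TokenFactor` for the chains. A standalone sketch of that layout (illustrative only, not the actual `GenWidenVectorLoads` code):
```
#include <cstdio>

int main() {
  // From the trace: t67 (v10i32, 40 bytes) is widened to v16i32 (64 bytes)
  // and materialized as legal v8i32 chunks of 32 bytes each.
  const unsigned ElemBytes = 4;      // i32
  const unsigned WidenedElems = 16;  // v16i32
  const unsigned ChunkElems = 8;     // v8i32 (s256)

  unsigned WidenedBytes = WidenedElems * ElemBytes; // 64
  unsigned ChunkBytes = ChunkElems * ElemBytes;     // 32

  // One load per chunk: t86 at base+0 and t89 at base+32 in the trace,
  // combined by concat_vectors (t90) and a TokenFactor (t91).
  for (unsigned Off = 0; Off < WidenedBytes; Off += ChunkBytes)
    std::printf("v8i32 load at base + %u\n", Off);
  return 0;
}
```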


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106447/new/

https://reviews.llvm.org/D106447


