[PATCH] D30810: Preserve vec3 type.
JinGu Kang via Phabricator via cfe-commits
cfe-commits at lists.llvm.org
Thu Mar 16 05:15:48 PDT 2017
jaykang10 added a comment.
In https://reviews.llvm.org/D30810#702614, @Anastasia wrote:
> In https://reviews.llvm.org/D30810#702443, @bruno wrote:
>
> > > As a result, I think it would be good for clang to have both of features and I would like to stick to the option "-fpresereve-vec3' to change the behavior easily.
> >
> > The motivation doesn't seem solid to me, who else is going to benefit from this flag?
>
>
> There are some off the main tree implementation that would benefit. But in the case of AMD GPU 3 loads/stores will be produced instead of 4. Sounds like a good optimization to me. As I said in my previous comment I think it should have been the default behavior from the beginning, but since different implementation landed first we can integrate this one now with an additional option.
Additionally, Here is assembly output from vec3 with amdgcn target. :)
LLVM IR
define void @foo(<3 x float>* nocapture %a, <3 x float>* nocapture readonly %b) {
entry:
%0 = load <3 x float>, <3 x float>* %b, align 16
store <3 x float> %0, <3 x float>* %a, align 16
ret void
}
Assembly Output
.text
.section .AMDGPU.config
.long 47176
.long 11272256
.long 47180
.long 132
.long 47200
.long 0
.long 4
.long 0
.long 8
.long 0
.text
.globl foo
.p2align 8
.type foo, at function
foo: ; @foo
; BB#0: ; %entry
s_load_dword s2, s[0:1], 0x9
s_load_dword s0, s[0:1], 0xa
s_mov_b32 s4, SCRATCH_RSRC_DWORD0
s_mov_b32 s5, SCRATCH_RSRC_DWORD1
s_mov_b32 s6, -1
s_mov_b32 s8, s3
s_mov_b32 s7, 0xe8f000
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v0, s0
buffer_load_dword v2, v0, s[4:7], s8 offen
buffer_load_dword v3, v0, s[4:7], s8 offen offset:8
buffer_load_dword v0, v0, s[4:7], s8 offen offset:4
v_mov_b32_e32 v1, s2
s_waitcnt vmcnt(0)
buffer_store_dword v0, v1, s[4:7], s8 offen offset:4
buffer_store_dword v2, v1, s[4:7], s8 offen
buffer_store_dword v3, v1, s[4:7], s8 offen offset:8
s_endpgm
.Lfunc_end0:
.size foo, .Lfunc_end0-foo
.section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 112
; NumSgprs: 9
; NumVgprs: 4
; FloatMode: 192
; IeeeMode: 1
; ScratchSize: 0
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
; VGPRBlocks: 0
; NumSGPRsForWavesPerEU: 9
; NumVGPRsForWavesPerEU: 4
; ReservedVGPRFirst: 0
; ReservedVGPRCount: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 2
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
.section ".note.GNU-stack"
https://reviews.llvm.org/D30810
More information about the cfe-commits
mailing list