[PATCH] D30810: Preserve vec3 type.

Thu Mar 16 05:15:48 PDT 2017

jaykang10 added a comment.

In https://reviews.llvm.org/D30810#702614, @Anastasia wrote:

> In https://reviews.llvm.org/D30810#702443, @bruno wrote:
>
> > > As a result, I think it would be good for clang to have both features, and I would like to stick to the option '-fpreserve-vec3' to change the behavior easily.
> >
> > The motivation doesn't seem solid to me, who else is going to benefit from this flag?
>
>
> There are some out-of-tree implementations that would benefit. But in the case of AMD GPU, 3 loads/stores will be produced instead of 4. Sounds like a good optimization to me. As I said in my previous comment, I think it should have been the default behavior from the beginning, but since a different implementation landed first, we can integrate this one now with an additional option.

Additionally, here is the assembly output for vec3 with the amdgcn target. :)

LLVM IR

  ; Copies a single <3 x float> value from %b to %a.
  ; Both the load and the store use the 3-element vector type directly
  ; (no <4 x float> appears anywhere in this function).
  define void @foo(<3 x float>* nocapture %a, <3 x float>* nocapture readonly %b) {
  entry:
    ; Read the whole vec3 from %b; the access is declared 16-byte aligned.
    %0 = load <3 x float>, <3 x float>* %b, align 16
    ; Write the same value, unmodified, to %a with the same declared alignment.
    store <3 x float> %0, <3 x float>* %a, align 16
    ret void
  }

Assembly Output

  	.text
  	.section	.AMDGPU.config
  	.long	47176
  	.long	11272256
  	.long	47180
  	.long	132
  	.long	47200
  	.long	0
  	.long	4
  	.long	0
  	.long	8
  	.long	0
  	.text
  	.globl	foo
  	.p2align	8
  	.type	foo,@function
  ; foo: copies three consecutive dwords (byte offsets 0, 4 and 8) from one
  ; address to another, using three buffer_load_dword instructions followed by
  ; three buffer_store_dword instructions.
  ;   s0 -> v0 : base address for the loads
  ;   s2 -> v1 : base address for the stores
  ;   s[4:7]   : four-dword operand shared by every buffer_* instruction
  ;   s8       : copy of s3, passed as the scalar offset operand of every buffer_*
  ; NOTE(review): s2/s0 presumably hold the %a/%b pointer arguments of the IR
  ; function (store target / load source respectively) -- confirm against the
  ; kernel argument layout.
  foo:                                    ; @foo
  ; BB#0:                                 ; %entry
  	; Fetch two dwords at offsets 0x9 and 0xa relative to s[0:1].
  	; The second load overwrites s0, so s[0:1] is not usable as a base afterwards.
  	s_load_dword s2, s[0:1], 0x9
  	s_load_dword s0, s[0:1], 0xa
  	; Assemble s[4:7] one dword at a time.
  	; NOTE(review): SCRATCH_RSRC_DWORD0/1 are external symbols; the meaning of
  	; -1 and 0xe8f000 is not shown here -- presumably resource descriptor fields.
  	s_mov_b32 s4, SCRATCH_RSRC_DWORD0
  	s_mov_b32 s5, SCRATCH_RSRC_DWORD1
  	s_mov_b32 s6, -1
  	; s3 is never written in this function, so it must be live on entry.
  	s_mov_b32 s8, s3
  	s_mov_b32 s7, 0xe8f000
  	; Block until the s_load_dword results (s0, s2) are available.
  	s_waitcnt lgkmcnt(0)
  	v_mov_b32_e32 v0, s0
  	; Three loads from base v0: +0 -> v2, +8 -> v3, +4 -> v0.
  	; The +4 load is issued last because its destination is v0, which also
  	; holds the base address used by the two loads before it.
  	buffer_load_dword v2, v0, s[4:7], s8 offen
  	buffer_load_dword v3, v0, s[4:7], s8 offen offset:8
  	buffer_load_dword v0, v0, s[4:7], s8 offen offset:4
  	v_mov_b32_e32 v1, s2
  	; Block until all three buffer loads have returned before storing.
  	s_waitcnt vmcnt(0)
  	; Three stores to base v1, each value going to the offset it was loaded from:
  	; v0 -> +4, v2 -> +0, v3 -> +8.
  	buffer_store_dword v0, v1, s[4:7], s8 offen offset:4
  	buffer_store_dword v2, v1, s[4:7], s8 offen
  	buffer_store_dword v3, v1, s[4:7], s8 offen offset:8
  	s_endpgm
  .Lfunc_end0:
  	; Symbol size = distance from the foo label to the end label.
  	.size	foo, .Lfunc_end0-foo

  	.section	.AMDGPU.csdata
  ; Kernel info:
  ; codeLenInByte = 112
  ; NumSgprs: 9
  ; NumVgprs: 4
  ; FloatMode: 192
  ; IeeeMode: 1
  ; ScratchSize: 0
  ; LDSByteSize: 0 bytes/workgroup (compile time only)
  ; SGPRBlocks: 1
  ; VGPRBlocks: 0
  ; NumSGPRsForWavesPerEU: 9
  ; NumVGPRsForWavesPerEU: 4
  ; ReservedVGPRFirst: 0
  ; ReservedVGPRCount: 0
  ; COMPUTE_PGM_RSRC2:USER_SGPR: 2
  ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
  ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
  ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
  ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
  ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0

  	.section	".note.GNU-stack"

https://reviews.llvm.org/D30810