[PATCH] D30810: Preserve vec3 type.
JinGu Kang via Phabricator via cfe-commits
cfe-commits at lists.llvm.org
Tue Mar 21 17:47:34 PDT 2017
jaykang10 added a comment.
In https://reviews.llvm.org/D30810#706677, @Anastasia wrote:
> In https://reviews.llvm.org/D30810#706676, @Anastasia wrote:
>
> > In https://reviews.llvm.org/D30810#699428, @Anastasia wrote:
> >
> > > Would you be able to update ScalarExprEmitter::VisitAsTypeExpr implementation accordingly to make things consistent?
> >
> >
> > Not sure it got lost somewhere... do you think you could address this too?
>
>
> I guess due to 4 element alignment the generated casts as vec4 should be fine for vec3, but it is worth checking...
Let's look at examples.
Source code
void kernel float4_to_float3(global float3 *a, global float4 *b) {
//float4 --> float3
*a = __builtin_astype(*b, float3);
}
void kernel float3_to_float4(global float3 *a, global float4 *b) {
//float3 --> float4
*b = __builtin_astype(*a, float4);
}
LLVM IR
; Function Attrs: norecurse nounwind
define void @float4_to_float3(<3 x float>* nocapture %a, <4 x float>* nocapture readonly %b) {
entry:
%0 = load <4 x float>, <4 x float>* %b, align 16, !tbaa !6
%storetmp = bitcast <3 x float>* %a to <4 x float>*
store <4 x float> %0, <4 x float>* %storetmp, align 16, !tbaa !6
ret void
}
; Function Attrs: norecurse nounwind
define void @float3_to_float4(<3 x float>* nocapture readonly %a, <4 x float>* nocapture %b) {
entry:
%castToVec4 = bitcast <3 x float>* %a to <4 x float>*
%loadVec4 = load <4 x float>, <4 x float>* %castToVec4, align 16
store <4 x float> %loadVec4, <4 x float>* %b, align 16, !tbaa !6
ret void
}
We could change above IR with vec3 as following:
; Function Attrs: norecurse nounwinddefine void @float4_to_float3(<3 x float>* nocapture %a, <4 x float>* nocapture readonly %b) {
entry:
%0 = load <4 x float>, <4 x float>* %b, align 16
%1 = shufflevector <4 x float> %0, <4 x float> undef, <3 x i32><i32 0, i32 1, i32 2>
store <3 x float> %1, <3 x float>* %a, align 16
ret void
}
; Function Attrs: norecurse nounwinddefine void @float3_to_float4(<3 x float>* nocapture readonly %a, <4 x float>* nocapture %b){
entry:
%0 = load <3 x float>, <3 x float>* %a, align 16 %1 = shufflevector <3 x float> %0, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
store <4 x float> %1, <4 x float>* %b, align 16
ret void
}
I guess it is what you want on LLVM IR. I have compiled above IRs with amdgcn target.
vec4 float4_to_float3 assembly code
float4_to_float3: ; @float4_to_float3
; BB#0: ; %entry
s_load_dword s2, s[0:1], 0x9
s_load_dword s0, s[0:1], 0xa
s_mov_b32 s4, SCRATCH_RSRC_DWORD0
s_mov_b32 s5, SCRATCH_RSRC_DWORD1
s_mov_b32 s6, -1
s_mov_b32 s8, s3
s_mov_b32 s7, 0xe8f000
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v0, s0
buffer_load_dword v2, v0, s[4:7], s8 offen
buffer_load_dword v3, v0, s[4:7], s8 offen offset:4
buffer_load_dword v4, v0, s[4:7], s8 offen offset:8
buffer_load_dword v0, v0, s[4:7], s8 offen offset:12
v_mov_b32_e32 v1, s2
s_waitcnt vmcnt(0)
buffer_store_dword v0, v1, s[4:7], s8 offen offset:12
buffer_store_dword v4, v1, s[4:7], s8 offen offset:8
buffer_store_dword v3, v1, s[4:7], s8 offen offset:4
buffer_store_dword v2, v1, s[4:7], s8 offen
s_endpgm
vec3 float4_to_float3 assembly code
float4_to_float3: ; @float4_to_float3
; BB#0: ; %entry
s_load_dword s2, s[0:1], 0x9
s_load_dword s0, s[0:1], 0xa
s_mov_b32 s4, SCRATCH_RSRC_DWORD0
s_mov_b32 s5, SCRATCH_RSRC_DWORD1
s_mov_b32 s6, -1
s_mov_b32 s8, s3
s_mov_b32 s7, 0xe8f000
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v0, s0
buffer_load_dword v2, v0, s[4:7], s8 offen
buffer_load_dword v3, v0, s[4:7], s8 offen offset:8
buffer_load_dword v0, v0, s[4:7], s8 offen offset:4
v_mov_b32_e32 v1, s2
s_waitcnt vmcnt(0)
buffer_store_dword v0, v1, s[4:7], s8 offen offset:4
buffer_store_dword v2, v1, s[4:7], s8 offen
buffer_store_dword v3, v1, s[4:7], s8 offen offset:8
s_endpgm
I think it is what we expect because there are 3 load/store for vec instead of 4 load/store.
The float3_to_float4's output is different with float4_to_float3.
vec4 float3_to_float4 assembly code
float3_to_float4: ; @float3_to_float4
; BB#0: ; %entry
s_load_dword s2, s[0:1], 0x9
s_mov_b32 s4, SCRATCH_RSRC_DWORD0
s_mov_b32 s5, SCRATCH_RSRC_DWORD1
s_mov_b32 s6, -1
s_mov_b32 s8, s3
s_mov_b32 s7, 0xe8f000
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v0, s2
buffer_load_dword v2, v0, s[4:7], s8 offen
buffer_load_dword v3, v0, s[4:7], s8 offen offset:4
buffer_load_dword v4, v0, s[4:7], s8 offen offset:8
buffer_load_dword v0, v0, s[4:7], s8 offen offset:12
s_load_dword s0, s[0:1], 0xa
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v1, s0
s_waitcnt vmcnt(0)
buffer_store_dword v0, v1, s[4:7], s8 offen offset:12
buffer_store_dword v4, v1, s[4:7], s8 offen offset:8
buffer_store_dword v3, v1, s[4:7], s8 offen offset:4
buffer_store_dword v2, v1, s[4:7], s8 offen
s_endpgm
vec4 float3_to_float4 assembly code
float3_to_float4: ; @float3_to_float4
; BB#0: ; %entry
s_load_dword s2, s[0:1], 0x9
s_mov_b32 s4, SCRATCH_RSRC_DWORD0
s_mov_b32 s5, SCRATCH_RSRC_DWORD1
s_mov_b32 s6, -1
s_mov_b32 s8, s3
s_mov_b32 s7, 0xe8f000
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v0, s2
buffer_load_dword v2, v0, s[4:7], s8 offen
buffer_load_dword v3, v0, s[4:7], s8 offen offset:4
buffer_load_dword v4, v0, s[4:7], s8 offen offset:8
buffer_load_dword v0, v0, s[4:7], s8 offen offset:12
s_load_dword s0, s[0:1], 0xa
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v1, s0
s_waitcnt vmcnt(0)
buffer_store_dword v0, v1, s[4:7], s8 offen offset:12
buffer_store_dword v4, v1, s[4:7], s8 offen offset:8
buffer_store_dword v3, v1, s[4:7], s8 offen offset:4
buffer_store_dword v2, v1, s[4:7], s8 offen
s_endpgm
The output is same for float3_to_float4 between vec3 and vec4 because codegen legalizes the type with widen vector according to alignment. In float4_to_float3 case, codegen legalizes the store's vec3 type differently and in the end, it generates 3 load/store. As a result, I think we could generate above vec3 IR or similar one with option and it could be better output for GPU. On previous comments, I was wrong for "ScalarExprEmitter::VisitAsTypeExpr". I am sorry for that. If you agree with above vec3 IRs, I will add it on this patch. If I missed something or you have another idea or opinion, please let me know.
https://reviews.llvm.org/D30810
More information about the cfe-commits
mailing list