[llvm-dev] How to change CLang struct alignment behaviour?
Joan Lluch via llvm-dev
llvm-dev at lists.llvm.org
Mon May 13 11:58:58 PDT 2019
Hi Tim,
I agree this should be “solved” in LLVM.
Just as a matter of information, this is the test source code and the generated assembly for my target architecture (more comments below):
/* Test aggregate with three char members (3 bytes, an odd size).
   In the IR shown later in this message it is allocated with "align 1". */
struct AA
{
char n;
char m;
char j;
};
/* Test aggregate with four char members (4 bytes, a multiple of 2).
   In the IR shown later in this message its storage is an i32 with "align 2". */
struct BB
{
char n;
char m;
char j;
char k;
};
/* External consumers, defined outside this test file. Each receives a pointer
   to the caller's local struct, so the struct has to be materialized in memory. */
extern void convertA( struct AA *a);
extern void convertB( struct BB *b);
/* Builds a local struct AA with n = 3 and m = 4 (j is not listed in the
   initializer, so it is zero) and passes its address to convertA. */
void callConvertA()
{
struct AA a = {3, 4};
convertA( &a );
}
/* Builds a local struct BB with n = 3 and m = 4 (j and k are not listed in the
   initializer, so they are zero) and passes its address to convertB. */
void callConvertB()
{
struct BB b = {3, 4};
convertB( &b );
}
; callConvertA: copies the 3-byte constant .LcallConvertA.a into a 4-byte
; stack slot, then calls convertA with the slot's address in r0.
; On this target the destination operand is the RIGHT-most operand.
callConvertA: ; @callConvertA
; %bb.0: ; %entry
sub SP, #4, SP ; SP = SP - 4: reserve the stack slot for the local struct
mov &.LcallConvertA.a, r0 ; r0 = address of the constant initializer
ld.sb [r0, #2], r1 ; r1 = initializer byte 2 (sign-extending 8-bit load)
st.b r1, [SP, #2] ; slot byte 2 = r1 (8-bit truncating store)
ld.sb [r0, #0], r1 ; r1 = initializer byte 0
zext r1, r1 ; presumably clears the bits above the low byte left by the sign-extending load
ld.sb [r0, #1], r0 ; r0 = initializer byte 1 (overwrites the base address, no longer needed)
zext r0, r0
bswap r0, r0 ; moves byte 1 into the high half of the word (acts as a shift here)
or r0, r1, r0 ; r0 = 16-bit word combining byte 1 (high) and byte 0 (low)
st.w r0, [SP, #0] ; slot bytes 0..1 = combined word (16-bit store)
mov SP, r0 ; r0 = address of the stack copy; presumably the argument register for convertA
call &convertA
add SP, #4, SP ; release the stack slot
ret
.Lfunc_end0:
.size callConvertA, .Lfunc_end0-callConvertA
; -- End function
; callConvertB: materializes struct BB {3, 4, 0, 0} directly on the stack with
; two 16-bit immediate stores (no constant-pool copy), then calls convertB
; with the slot's address in r0.
; On this target the destination operand is the RIGHT-most operand.
.globl callConvertB ; -- Begin function callConvertB
.p2align 1
.type callConvertB,@function
callConvertB: ; @callConvertB
; %bb.0: ; %entry
sub SP, #4, SP ; SP = SP - 4: reserve the 4-byte stack slot
mov #0, r0
st.w r0, [SP, #2] ; slot bytes 2..3 = 0 (members j and k)
mov #1027, r0 ; 1027 = 0x0403: low byte 3 (n), high byte 4 (m)
st.w r0, [SP, #0] ; slot bytes 0..1 = 0x0403
mov SP, r0 ; r0 = address of the stack copy; presumably the argument register for convertB
call &convertB
add SP, #4, SP ; release the stack slot
ret
.Lfunc_end1:
.size callConvertB, .Lfunc_end1-callConvertB
; -- End function
; Read-only initializer image for the local `a` in callConvertA:
; three bytes {3, 4, 0}, copied byte-wise/word-wise onto the stack at run time.
.type .LcallConvertA.a,@object ; @callConvertA.a
.section .rodata,"a",@progbits
.LcallConvertA.a:
.byte 3 ; 0x3
.byte 4 ; 0x4
.byte 0 ; 0x0
.size .LcallConvertA.a, 3
Please note that for this architecture the destination operand is on the RIGHT HAND SIDE, this is important to know to read the assembly correctly.
“ld.sb” are sign-extending 8 bit loads
“st.b” are 8 bit trunc stores
“ld.w” are 16 bit word loads
“st.w” are 16 bit word stores
The generated code is functionally correct, but:
- For callConvertA i8, i8, i8 loads + i8, i16 stores are generated. To combine the values of two of the i8 loads into a single i16 store, the “zext” + “bswap” (equivalent to a shift) + “or” trick is performed
- For callConvertB two i16 immediate moves + i16, i16 stores are generated (no memory loads are needed, because the initializer is folded into constants).
The desired behaviour would be to have i16, i8 loads + i16, i8 stores for callConvertA.
The only difference that I can spot by debugging the LLVM source code is that getMemcpyLoadsAndStores is called with align = 1 for callConvertA, but it is NOT called for callConvertB.
The Clang generated IR is this:
; ModuleID = 'add.c'
source_filename = "add.c"
; Data layout: "e" = little-endian; "p:16:16" = 16-bit pointers, 16-bit aligned;
; i32/i64/f32/f64 are all 16-bit aligned; "a:8" = aggregates need only byte
; alignment; "n8:16" = native integer widths 8 and 16; "S16" = 16-bit stack alignment.
target datalayout = "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"
target triple = "cpu74"
; The two test structs: three and four i8 members respectively.
%struct.AA = type { i8, i8, i8 }
%struct.BB = type { i8, i8, i8, i8 }
; Constant image {3, 4, 0} used to initialize the local `a` in callConvertA.
; Note the byte alignment ("align 1").
@callConvertA.a = private unnamed_addr constant %struct.AA { i8 3, i8 4, i8 0 }, align 1
; callConvertA: the 3-byte local is initialized by a memcpy from the constant
; @callConvertA.a, with "align 1" on both source and destination.
; Function Attrs: minsize nounwind optsize
define dso_local void @callConvertA() local_unnamed_addr #0 {
entry:
; Byte-aligned stack slot for the local struct.
%a = alloca %struct.AA, align 1
; i8* to the first member, used by the lifetime markers and the memcpy.
%0 = getelementptr inbounds %struct.AA, %struct.AA* %a, i16 0, i32 0
call void @llvm.lifetime.start.p0i8(i64 3, i8* nonnull %0) #3
; 3-byte copy, both pointers only byte-aligned; this is the memcpy whose
; lowering the message discusses.
call void @llvm.memcpy.p0i8.p0i8.i16(i8* nonnull align 1 %0, i8* align 1 getelementptr inbounds (%struct.AA, %struct.AA* @callConvertA.a, i16 0, i32 0), i16 3, i1 false)
call void @convertA(%struct.AA* nonnull %a) #4
call void @llvm.lifetime.end.p0i8(i64 3, i8* nonnull %0) #3
ret void
}
; Declarations of the intrinsics and the external callee used by callConvertA
; (the lifetime intrinsics are also used by callConvertB).
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
; memcpy variant taking a 16-bit length (i16), matching the 16-bit target.
; Function Attrs: argmemonly nounwind
declare void @llvm.memcpy.p0i8.p0i8.i16(i8* nocapture writeonly, i8* nocapture readonly, i16, i1) #1
; Function Attrs: minsize optsize
declare dso_local void @convertA(%struct.AA*) local_unnamed_addr #2
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
; callConvertB: the 4-byte local is represented as a single i32 with "align 2"
; and initialized with one store; no memcpy appears in this function.
; Function Attrs: minsize nounwind optsize
define dso_local void @callConvertB() local_unnamed_addr #0 {
entry:
; The struct's storage is allocated as an i32, 2-byte aligned.
%b = alloca i32, align 2
; View of the same storage as %struct.BB*, used for the call below.
%tmpcast = bitcast i32* %b to %struct.BB*
%0 = bitcast i32* %b to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #3
; 1027 = 0x00000403; on this little-endian layout the bytes are 3, 4, 0, 0,
; i.e. n = 3, m = 4, j = 0, k = 0.
store i32 1027, i32* %b, align 2
call void @convertB(%struct.BB* nonnull %tmpcast) #4
call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #3
ret void
}
; External callee of callConvertB; takes a pointer to the 4-byte struct.
; Function Attrs: minsize optsize
declare dso_local void @convertB(%struct.BB*) local_unnamed_addr #2
; Attribute groups referenced above:
;   #0 - the two function definitions (callConvertA, callConvertB)
;   #1 - the lifetime and memcpy intrinsic declarations
;   #2 - the external convertA / convertB declarations
;   #3 - the lifetime.start / lifetime.end call sites
;   #4 - the convertA / convertB call sites
attributes #0 = { minsize nounwind optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
attributes #2 = { minsize optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind }
attributes #4 = { minsize nounwind optsize }
; Module-level metadata: a "wchar_size" module flag of 2 and the producing
; compiler's identification string.
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 2}
!1 = !{!"clang version 7.0.1 (tags/RELEASE_701/final)"}
John
Tel: 620 28 45 13
> On 13 May 2019, at 20:09, Tim Northover <t.p.northover at gmail.com> wrote:
>
> Hi Joan,
>
> On Mon, 13 May 2019 at 18:01, Joan Lluch <joan.lluch at icloud.com> wrote:
>> After looking at it a bit further, I think this is a Clang thing. Clang issues “align 2” if the struct has at least one int (2 bytes), but also if the entire struct size is a multiple of 2 (for example, a struct with 4 char members). In these cases the LLVM backend correctly creates word sized load/stores (2 bytes).
>
> I'm slightly surprised that it happens based purely on size, but
> either way LLVM should be able to cope.
>
>> The LLVM backend just follows what’s dictated by Clang regarding alignment and thus it creates 2 byte or 1 byte load/store instructions accordingly. I have not found a way to override this in LLVM. Any suggestions are appreciated.
>
> That sounds right, but I don't think it explains the shifts you
> described before. It should work out a lot better than what you're
> seeing. Specifically, a 3 byte struct (for example) ought to either
> lower to:
>
> load i16, load i8 + stores if your target can do misaligned i16 operations.
>
> or
>
> load i8, load i8, load i8 + stores if not.
>
> Neither of those involve shifting operations. I'd suggest breaking
> just after getMemcpyLoadsAndStores and using SelectionDAG::dump to see
> exactly what it's created. Then try to work out where that gets
> pessimized to shifts, because it's not normal.
>
> Cheers.
>
> Tim.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190513/2c20ac23/attachment.html>
More information about the llvm-dev
mailing list