[llvm-dev] How to change CLang struct alignment behaviour?

Mon May 13 11:58:58 PDT 2019

Hi Tim,

I agree this should be “solved” in LLVM

Just as a matter of information, this is the test source code and the generated assembly for my target architecture (more comments below):

struct AA
{
  char n;
  char m;
  char j;
};

struct BB
{
  char n;
  char m;
  char j;
  char k;
};

extern void convertA( struct AA *a);
extern void convertB( struct BB *b);

void callConvertA()
{
  struct AA a = {3, 4};
  convertA( &a );
}

void callConvertB()
{
  struct BB b = {3, 4};
  convertB( &b );
}

callConvertA:                           ; @callConvertA
; %bb.0:                                ; %entry
	sub	SP, #4, SP
	mov	&.LcallConvertA.a, r0
	ld.sb	[r0, #2], r1
	st.b	r1, [SP, #2]
	ld.sb	[r0, #0], r1
	zext	r1, r1
	ld.sb	[r0, #1], r0
	zext	r0, r0
	bswap	r0, r0
	or	r0, r1, r0
	st.w	r0, [SP, #0]
	mov	SP, r0
	call	&convertA
	add	SP, #4, SP
	ret
.Lfunc_end0:
	.size	callConvertA, .Lfunc_end0-callConvertA
                                        ; -- End function
	.globl	callConvertB            ; -- Begin function callConvertB
	.p2align	1
	.type	callConvertB, at function
callConvertB:                           ; @callConvertB
; %bb.0:                                ; %entry
	sub	SP, #4, SP
	mov	#0, r0
	st.w	r0, [SP, #2]
	mov	#1027, r0
	st.w	r0, [SP, #0]
	mov	SP, r0
	call	&convertB
	add	SP, #4, SP
	ret
.Lfunc_end1:
	.size	callConvertB, .Lfunc_end1-callConvertB
                                        ; -- End function
	.type	.LcallConvertA.a, at object ; @callConvertA.a
	.section	.rodata,"a", at progbits
.LcallConvertA.a:
	.byte	3                       ; 0x3
	.byte	4                       ; 0x4
	.byte	0                       ; 0x0
	.size	.LcallConvertA.a, 3

Please note that for this architecture the destination operand is on the RIGHT HAND SIDE, this is important to know to read the assembly correctly.

“ld.sb" are signextend 8 bit loads
“st,b” are 8 bit trunc stores
“ld.w” are 16 bit word loads
“st.w” are 16 bit word stores

The generated code is functionally correct, but:

- For callConvertA  i8, i8, i8 loads + i8, i16 stores are generated. To get the value of two i8, i8 loads into a i16 store the “zext”+”bswap” (equivalent to shift) + “or” trick is performed
- For callConvertB i16, i16 loads + i16, i16 stores are generated.

The desired behaviour would be to have i16, i8 loads + i16, i8 stores for the callConvertA.

The only difference that I can spot by debugging the LLVM source code is that getMemcpyLoadsAndStores is called with align = 1 for callConvertA, but it is NOT called for callConvertB.

The Clang generated IR is this:

; ModuleID = 'add.c'
source_filename = "add.c"
target datalayout = "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"
target triple = "cpu74"

%struct.AA = type { i8, i8, i8 }
%struct.BB = type { i8, i8, i8, i8 }

@callConvertA.a = private unnamed_addr constant %struct.AA { i8 3, i8 4, i8 0 }, align 1

; Function Attrs: minsize nounwind optsize
define dso_local void @callConvertA() local_unnamed_addr #0 {
entry:
  %a = alloca %struct.AA, align 1
  %0 = getelementptr inbounds %struct.AA, %struct.AA* %a, i16 0, i32 0
  call void @llvm.lifetime.start.p0i8(i64 3, i8* nonnull %0) #3
  call void @llvm.memcpy.p0i8.p0i8.i16(i8* nonnull align 1 %0, i8* align 1 getelementptr inbounds (%struct.AA, %struct.AA* @callConvertA.a, i16 0, i32 0), i16 3, i1 false)
  call void @convertA(%struct.AA* nonnull %a) #4
  call void @llvm.lifetime.end.p0i8(i64 3, i8* nonnull %0) #3
  ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1

; Function Attrs: argmemonly nounwind
declare void @llvm.memcpy.p0i8.p0i8.i16(i8* nocapture writeonly, i8* nocapture readonly, i16, i1) #1

; Function Attrs: minsize optsize
declare dso_local void @convertA(%struct.AA*) local_unnamed_addr #2

; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1

; Function Attrs: minsize nounwind optsize
define dso_local void @callConvertB() local_unnamed_addr #0 {
entry:
  %b = alloca i32, align 2
  %tmpcast = bitcast i32* %b to %struct.BB*
  %0 = bitcast i32* %b to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #3
  store i32 1027, i32* %b, align 2
  call void @convertB(%struct.BB* nonnull %tmpcast) #4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #3
  ret void
}

; Function Attrs: minsize optsize
declare dso_local void @convertB(%struct.BB*) local_unnamed_addr #2

attributes #0 = { minsize nounwind optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
attributes #2 = { minsize optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind }
attributes #4 = { minsize nounwind optsize }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 2}
!1 = !{!"clang version 7.0.1 (tags/RELEASE_701/final)"}   

John

Tel: 620 28 45 13

> On 13 May 2019, at 20:09, Tim Northover <t.p.northover at gmail.com> wrote:
> 
> Hi Joan,
> 
> On Mon, 13 May 2019 at 18:01, Joan Lluch <joan.lluch at icloud.com> wrote:
>> After looking at it a bit further, I think this is a Clang thing.  Clang issues “align 2” if the struct has at least one int (2 bytes), but also if the entire struct size is multiple of 2. For example a struct with 4 char members. In these cases the LLVM backend correctly creates word sized load/stores (2 bytes).
> 
> I'm slightly surprised that it happens based purely on size, but
> either way LLVM should be able to cope.
> 
>> The LLVM backend just follows what’s dictated by Clang regarding alignment and thus it creates 2 byte or 1 byte load/stores instructions accordingly. I have not found a way to override this in LLVM. Any suggestions are appreciated.
> 
> That sounds right, but I don't think it explains the shifts you
> described before. It should work out a lot better than what you're
> seeing. Specifically, a 3 byte struct (for example) ought to either
> lower to:
> 
>    load i16, load i8 + stores if your target can do misaligned i16 operations.
> 
> or
> 
>    load i8, load i8, load i8 + stores if not.
> 
> Neither of those involve shifting operations. I'd suggest breaking
> just after getMemcpyLoadsAndStores and using SelectionDAG::dump to see
> exactly what it's created. Then try to work out where that gets
> pessimized to shifts, because it's not normal.
> 
> Cheers.
> 
> Tim.

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190513/2c20ac23/attachment.html>