[llvm-dev] Suboptimal code generated by clang+llc in quite a common scenario (?)

Thu Aug 8 08:18:56 PDT 2019

I found something that I don't quite understand when compiling a common piece of code using the -Os flag.
I found it while testing my own backend, but then I dug deeper and found that at least the x86 backend is affected as well. This is the code in question:

char pp[3];
char *scscx = pp;
int tst( char i, char j, char k )
{
  scscx[0] = i;
  scscx[1] = j; 
  scscx[2] = k;
  return 0;
}

When targeting the x86 architecture, the above gets compiled into the following LLVM IR:

; Function Attrs: nofree norecurse nounwind optsize uwtable
define i32 @tst(i8 signext %i, i8 signext %j, i8 signext %k) local_unnamed_addr #1 {
entry:
  %0 = load i8*, i8** @scscx, align 8, !tbaa !11
  store i8 %i, i8* %0, align 1, !tbaa !13
  %1 = load i8*, i8** @scscx, align 8, !tbaa !11
  %arrayidx1 = getelementptr inbounds i8, i8* %1, i64 1
  store i8 %j, i8* %arrayidx1, align 1, !tbaa !13
  %2 = load i8*, i8** @scscx, align 8, !tbaa !11
  %arrayidx2 = getelementptr inbounds i8, i8* %2, i64 2
  store i8 %k, i8* %arrayidx2, align 1, !tbaa !13
  ret i32 0
}

According to that, the variable ‘scscx’ is loaded three times even though it is never modified. The resulting assembly code is this:

	.globl	_tst
_tst:
	.cfi_startproc
	pushl	%ebp
	.cfi_def_cfa_offset 8
	.cfi_offset %ebp, -8
	movl	%esp, %ebp
	.cfi_def_cfa_register %ebp
	pushl	%esi
	.cfi_offset %esi, -12
	movb	16(%ebp), %al
	movb	12(%ebp), %cl
	movb	8(%ebp), %dl
	movl	_scscx, %esi
	movb	%dl, (%esi)
	movl	_scscx, %edx
	movb	%cl, 1(%edx)
	movl	_scscx, %ecx
	movb	%al, 2(%ecx)
	xorl	%eax, %eax
	popl	%esi
	popl	%ebp
	retl
	.cfi_endproc

	.comm	_pp,3,0
	.section	__DATA,__data
	.globl	_scscx
	.p2align	3
_scscx:
	.long	_pp

Again, _scscx is loaded three times instead of being kept in a register and reused, which is suboptimal.

Now, if I replace the original code with this:

int pp[3];
int *scscx = pp;
int tst( int i, int j, int k )
{
  scscx[0] = i;
  scscx[1] = j; 
  scscx[2] = k;
  return 0;
}

I get the following:

; Function Attrs: nofree norecurse nounwind optsize uwtable
define i32 @tst(i32 %i, i32 %j, i32 %k) local_unnamed_addr #1 {
entry:
  %0 = load i32*, i32** @scscx, align 8, !tbaa !11
  store i32 %i, i32* %0, align 4, !tbaa !13
  %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 1
  store i32 %j, i32* %arrayidx1, align 4, !tbaa !13
  %arrayidx2 = getelementptr inbounds i32, i32* %0, i64 2
  store i32 %k, i32* %arrayidx2, align 4, !tbaa !13
  ret i32 0
}

	.globl	_tst
_tst:
	.cfi_startproc
	pushl	%ebp
	.cfi_def_cfa_offset 8
	.cfi_offset %ebp, -8
	movl	%esp, %ebp
	.cfi_def_cfa_register %ebp
	pushl	%esi
	.cfi_offset %esi, -12
	movl	16(%ebp), %eax
	movl	12(%ebp), %ecx
	movl	8(%ebp), %edx
	movl	_scscx, %esi
	movl	%edx, (%esi)
	movl	%ecx, 4(%esi)
	movl	%eax, 8(%esi)
	xorl	%eax, %eax
	popl	%esi
	popl	%ebp
	retl
	.cfi_endproc

	.comm	_pp,12,2
	.section	__DATA,__data
	.globl	_scscx
	.p2align	3
_scscx:
	.long	_pp

In this case the compiler loads _scscx into a register once and reuses its value instead of loading the variable multiple times. This results in cleaner and more efficient code, especially when compared with the first case.

I would like to understand why this happens, and whether there’s a way (or workaround) to improve it.

Should I file a bug report for that?

Thanks.

Joan

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190808/2352599a/attachment-0001.html>