[llvm-dev] Suboptimal code generated by clang+llc in quite a common scenario (?)
Joan Lluch via llvm-dev
llvm-dev at lists.llvm.org
Thu Aug 8 08:18:56 PDT 2019
I found something that I don't quite understand when compiling a common piece of code using the -Os flag.
I found it while testing my own backend, but then I dug deeper and found that at least the x86 backend is affected as well. This is the code in question:
char pp[3];
char *scscx = pp;
int tst( char i, char j, char k )
{
scscx[0] = i;
scscx[1] = j;
scscx[2] = k;
return 0;
}
The above gets compiled for the x86 architecture like this:
; Function Attrs: nofree norecurse nounwind optsize uwtable
define i32 @tst(i8 signext %i, i8 signext %j, i8 signext %k) local_unnamed_addr #1 {
entry:
%0 = load i8*, i8** @scscx, align 8, !tbaa !11
store i8 %i, i8* %0, align 1, !tbaa !13
%1 = load i8*, i8** @scscx, align 8, !tbaa !11
%arrayidx1 = getelementptr inbounds i8, i8* %1, i64 1
store i8 %j, i8* %arrayidx1, align 1, !tbaa !13
%2 = load i8*, i8** @scscx, align 8, !tbaa !11
%arrayidx2 = getelementptr inbounds i8, i8* %2, i64 2
store i8 %k, i8* %arrayidx2, align 1, !tbaa !13
ret i32 0
}
According to that, the variable ‘scscx’ is loaded three times even though it is never modified. The resulting assembly code is this:
.globl _tst
_tst:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset %ebp, -8
movl %esp, %ebp
.cfi_def_cfa_register %ebp
pushl %esi
.cfi_offset %esi, -12
movb 16(%ebp), %al
movb 12(%ebp), %cl
movb 8(%ebp), %dl
movl _scscx, %esi
movb %dl, (%esi)
movl _scscx, %edx
movb %cl, 1(%edx)
movl _scscx, %ecx
movb %al, 2(%ecx)
xorl %eax, %eax
popl %esi
popl %ebp
retl
.cfi_endproc
.comm _pp,3,0
.section __DATA,__data
.globl _scscx
.p2align 3
_scscx:
.long _pp
Again, _scscx is loaded three times instead of being loaded once and kept in a register, which is suboptimal.
Now, if I replace the original code with this:
int pp[3];
int *scscx = pp;
int tst( int i, int j, int k )
{
scscx[0] = i;
scscx[1] = j;
scscx[2] = k;
return 0;
}
I get the following:
; Function Attrs: nofree norecurse nounwind optsize uwtable
define i32 @tst(i32 %i, i32 %j, i32 %k) local_unnamed_addr #1 {
entry:
%0 = load i32*, i32** @scscx, align 8, !tbaa !11
store i32 %i, i32* %0, align 4, !tbaa !13
%arrayidx1 = getelementptr inbounds i32, i32* %0, i64 1
store i32 %j, i32* %arrayidx1, align 4, !tbaa !13
%arrayidx2 = getelementptr inbounds i32, i32* %0, i64 2
store i32 %k, i32* %arrayidx2, align 4, !tbaa !13
ret i32 0
}
.globl _tst
_tst:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset %ebp, -8
movl %esp, %ebp
.cfi_def_cfa_register %ebp
pushl %esi
.cfi_offset %esi, -12
movl 16(%ebp), %eax
movl 12(%ebp), %ecx
movl 8(%ebp), %edx
movl _scscx, %esi
movl %edx, (%esi)
movl %ecx, 4(%esi)
movl %eax, 8(%esi)
xorl %eax, %eax
popl %esi
popl %ebp
retl
.cfi_endproc
.comm _pp,12,2
.section __DATA,__data
.globl _scscx
.p2align 3
_scscx:
.long _pp
In this case the compiler optimises the load of _scscx into a register and reuses its value instead of loading the variable multiple times. This results in cleaner and more optimal code, especially when compared with the first case.
I would like to understand why this happens, and whether there’s a way (or workaround) to improve it.
Should I file a bug report for that?
Thanks.
Joan
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20190808/2352599a/attachment-0001.html>
More information about the llvm-dev
mailing list