[PATCH] D35014: [X86] PR32755 : Improvement in CodeGen instruction selection for LEAs.
Lama via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 9 08:20:31 PDT 2017
lsaba added a comment.
In https://reviews.llvm.org/D35014#835498, @lsaba wrote:
> In https://reviews.llvm.org/D35014#835240, @jbhateja wrote:
>
> > @reviewers, kindly let me know if there are any more comments apart from the last comment from lsaba.
> > Thanks.
>
>
> Hi,
> I ran the patch on several benchmarks to check performance. Overall the changes look good, but there is a regression in one of the benchmarks (EEMBC/coremark-pro), caused by creating an undesired lea instruction instead of the previously created add instruction. I am working on creating a simple reproducer for the problem and would appreciate your patience.
>
> Thanks
The change in X86DAGToDAGISel::matchAddressBase is good when it allows us to get rid of extra lea/add instructions, or to replace a slow lea with a fast lea. In some cases, however, it only replaces an add instruction with a lea instruction, and since the throughput of add is higher than that of lea, we would prefer to keep the add instruction. For example, for the following IR:
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: norecurse nounwind uwtable
define void @foo() local_unnamed_addr #0 {
entry:
  br i1 undef, label %BB2, label %BB1

BB1:                                              ; preds = %entry
  %rem.us.1 = srem i32 undef, 65536
  br label %BB2

BB2:                                              ; preds = %BB1, %entry
  %s = phi i32 [ undef, %entry ], [ %rem.us.1, %BB1 ]
  %a = phi i32 [ 1, %entry ], [ 0, %BB1 ]
  %mul1 = mul nsw i32 %s, %a
  %rem1 = srem i32 %mul1, 65536
  %add1 = add nsw i32 %rem1, %a
  %conv1 = trunc i32 %add1 to i16
  store i16 %conv1, i16* undef, align 2, !tbaa !1
  %add2 = add i32 %add1, %a
  %0 = trunc i32 %add2 to i16
  %conv2 = and i16 %0, 255
  store i16 %conv2, i16* undef, align 2, !tbaa !1
  ret void
}

attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="true" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = !{!"clang version 6.0.0 (cfe/trunk 310239)"}
!1 = !{!2, !2, i64 0}
!2 = !{!"short", !3, i64 0}
!3 = !{!"omnipotent char", !4, i64 0}
!4 = !{!"Simple C/C++ TBAA"}
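For readability, here is a rough C-level equivalent of BB2 (a sketch only, not the actual coremark-pro source; the function name and the output pointers are made up, since the IR stores through undef pointers):

// Sketch of the pattern above: two dependent adds of the same value %a,
// each feeding a store.
void bb2_equivalent(short *out1, short *out2, int s, int a) {
  int rem1 = (s * a) % 65536;
  int add1 = rem1 + a;            // %add1: previously an addl, now a leal
  *out1 = (short)add1;
  int add2 = add1 + a;            // %add2: previously an addl, now a leal with scale 2
  *out2 = (short)(add2 & 255);
}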
The originally generated code was:
.LBB0_2: # %for.cond11.for.inc35_crit_edge.us.unr-lcssa
movl %eax, %ecx
imull %eax, %ecx
movl %ecx, %edx
sarl $31, %edx
shrl $16, %edx
addl %ecx, %edx
andl $-65536, %edx # imm = 0xFFFF0000
subl %edx, %ecx
addl %eax, %ecx
movw %cx, (%rax)
addl %eax, %ecx
movzbl %cl, %eax
movw %ax, (%rax)
retq
while the generated code now is:
movl %eax, %ecx
imull %eax, %ecx
movl %ecx, %edx
sarl $31, %edx
shrl $16, %edx
addl %ecx, %edx
andl $-65536, %edx # imm = 0xFFFF0000
subl %edx, %ecx
leal (%rcx,%rax), %edx
movw %dx, (%rax)
leal (%rcx,%rax,2), %eax
movzbl %al, %eax
movw %ax, (%rax)
retq
We need to refine this optimization further to avoid such cases, since the impact can be substantial if, for example, the code is in a hot loop.
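For illustration only, here is a minimal sketch of the kind of profitability check that could be layered on top of this patch (the helper name and parameters are made up and are not part of the patch or of the existing X86 ISel code):

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Sketch: only fold an (add X, Y) into an addressing mode / lea when doing so
// is likely to remove an instruction, not merely rewrite an add as an lea.
static bool isProfitableToFoldAddIntoLEA(SDNode *Add, bool FoldsIntoMemOperand) {
  // Folding into a load/store address deletes the standalone arithmetic
  // instruction entirely, which is usually a win.
  if (FoldsIntoMemOperand)
    return true;

  // If the add has other users, its value has to be materialized anyway, so
  // forming an extra lea only duplicates work.
  if (!Add->hasOneUse())
    return false;

  // Otherwise we would just trade one add for one lea; prefer the add, which
  // issues on more execution ports on many recent cores.
  return false;
}

On the reproducer above, a check along these lines would keep both adds: %add1 has a second user (%add2), and %add2 does not fold into a memory operand.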
https://reviews.llvm.org/D35014