[PATCH] D35014: [X86] PR32755 : Improvement in CodeGen instruction selection for LEAs.

Lama via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 9 08:20:31 PDT 2017


lsaba added a comment.

In https://reviews.llvm.org/D35014#835498, @lsaba wrote:

> In https://reviews.llvm.org/D35014#835240, @jbhateja wrote:
>
> > > @reviewers, kindly let me know if there are any more comments apart from the last comment from lsaba.
> >  Thanks.
>
>
> Hi,
>  I ran the patch on several benchmarks to check performance. Overall the changes look good, but there is a regression in one of the benchmarks (EEMBC/coremark-pro), caused by creating an undesired lea instruction instead of the previously generated add instruction. I am working on creating a simple reproducer for the problem and would appreciate your patience.
>
> Thanks


The change in X86DAGToDAGISel::matchAddressBase is good when it lets us get rid of extra lea/add instructions, or replace a slow lea with a fast lea. In some cases, however, it only replaces an add instruction with an lea instruction; since the throughput of the add instruction is higher, we would prefer to keep the add. For example, for the following IR:

  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-unknown-linux-gnu"
  ; Function Attrs: norecurse nounwind uwtable
  define void @foo() local_unnamed_addr #0 {
  entry:
    br i1 undef, label %BB2, label %BB1
  BB1:                                  ; preds = %entry
    %rem.us.1 = srem i32 undef, 65536
    br label %BB2
  BB2:      ; preds = %BB1, %entry
    %s = phi i32 [ undef, %entry ], [ %rem.us.1, %BB1 ]
    %a = phi i32 [ 1, %entry ], [ 0, %BB1 ]
    %mul1 = mul nsw i32 %s, %a
    %rem1 = srem i32 %mul1, 65536
    %add1 = add nsw i32 %rem1, %a
    %conv1 = trunc i32 %add1 to i16
    store i16 %conv1, i16* undef, align 2, !tbaa !1
    %add2 = add i32 %add1, %a
    %0 = trunc i32 %add2 to i16
    %conv2 = and i16 %0, 255
    store i16 %conv2, i16* undef, align 2, !tbaa !1
    ret void
  }
  attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="core-avx2" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="true" "use-soft-float"="false" }
  !llvm.ident = !{!0}
  !0 = !{!"clang version 6.0.0 (cfe/trunk 310239)"}
  !1 = !{!2, !2, i64 0}
  !2 = !{!"short", !3, i64 0}
  !3 = !{!"omnipotent char", !4, i64 0}
  !4 = !{!"Simple C/C++ TBAA"}

The originally generated code was:

  .LBB0_2:                                # %for.cond11.for.inc35_crit_edge.us.unr-lcssa
     movl	%eax, %ecx
     imull	%eax, %ecx
     movl	%ecx, %edx
     sarl	$31, %edx
     shrl	$16, %edx
     addl	%ecx, %edx
     andl	$-65536, %edx           # imm = 0xFFFF0000
     subl	%edx, %ecx
     addl	%eax, %ecx
     movw	%cx, (%rax)
     addl	%eax, %ecx
     movzbl	%cl, %eax
     movw	%ax, (%rax)
     retq

while the generated code now is:

  movl	%eax, %ecx
  imull	%eax, %ecx
  movl	%ecx, %edx
  sarl	$31, %edx
  shrl	$16, %edx
  addl	%ecx, %edx
  andl	$-65536, %edx           # imm = 0xFFFF0000
  subl	%edx, %ecx
  leal	(%rcx,%rax), %edx
  movw	%dx, (%rax)
  leal	(%rcx,%rax,2), %eax
  movzbl	%al, %eax
  movw	%ax, (%rax)
  retq

We need to refine this optimization further to avoid such cases, since the impact can be substantial if, for example, the code is in a hot loop.
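To make the tradeoff concrete, here is a minimal standalone C++ sketch of the kind of profitability check I have in mind. It is only an illustration, not the actual LLVM code: AddrMode, isLEAProfitable and DestMustDifferFromSources are made-up names, and the real decision would have to live in the instruction-selection / LEA-fixup logic. The idea is to form an LEA only when it folds something a plain two-operand add cannot express.

  // Illustrative sketch only (made-up names, not LLVM's API): prefer an LEA
  // over a plain ADD only when the addressing mode buys something an ADD
  // cannot provide in a single instruction.
  #include <cstdint>
  #include <iostream>

  struct AddrMode {
    bool HasBase;   // base register present
    bool HasIndex;  // index register present
    unsigned Scale; // 1, 2, 4 or 8
    int32_t Disp;   // constant displacement
  };

  // Hypothetical profitability check: a base+index LEA with scale 1 and no
  // displacement computes exactly what a two-operand ADD computes, so it is
  // only worthwhile when the result must land in a register different from
  // both sources (saving a mov); otherwise keep the higher-throughput ADD.
  bool isLEAProfitable(const AddrMode &AM, bool DestMustDifferFromSources) {
    unsigned Terms = unsigned(AM.HasBase) + unsigned(AM.HasIndex) +
                     unsigned(AM.Disp != 0);
    if (AM.Scale > 1)
      return true;  // folds a scaled index, which a single ADD cannot
    if (Terms >= 3)
      return true;  // three-operand sum: one LEA replaces two ADDs
    if (Terms == 2 && DestMustDifferFromSources)
      return true;  // saves a separate register-to-register mov
    return false;   // plain two-operand add: keep the ADD
  }

  int main() {
    // Mirrors leal (%rcx,%rax), %edx above: scale 1, no displacement, and the
    // original code was happy to clobber %ecx, so nothing is gained over ADD.
    AddrMode BasePlusIndex{true, true, 1, 0};
    std::cout << isLEAProfitable(BasePlusIndex, false) << '\n'; // 0: keep ADD
    // base + 2*index cannot be expressed by a single ADD, so an LEA can pay off.
    AddrMode Scaled{true, true, 2, 0};
    std::cout << isLEAProfitable(Scaled, false) << '\n';        // 1: LEA may help
  }

The exact conditions are of course a judgment call and target-dependent, but some check along these lines should avoid turning the plain adds in the example above into leas.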


https://reviews.llvm.org/D35014




