[LLVMbugs] [Bug 16941] New: LLVM has performance regression between r173893 and r173901

Tue Aug 20 08:40:36 PDT 2013

http://llvm.org/bugs/show_bug.cgi?id=16941

            Bug ID: 16941
           Summary: LLVM has performance regression between r173893 and
                    r173901
           Product: tools
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: opt
          Assignee: unassignedbugs at nondot.org
          Reporter: ili.filippov at gmail.com
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified

Created attachment 11072
  --> http://llvm.org/bugs/attachment.cgi?id=11072&action=edit
To reproduce

This issue occurs because revision r173901 adds a conversion from sext+and
to select, which makes performance worse.

Let's look at file no_opt.s:

; ModuleID = 'no_opt.bt'
target datalayout =
"e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: nounwind readnone
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) #0

; Function Attrs: nounwind
; Reproducer loop: keeps a 16-lane i32 counter vector (%i) and a 16-lane i32
; mask (%internal_mask_memory). Each trip through for_test ANDs the mask with
; the sign-extended result of (i < maxIterations) and leaves the loop once the
; movmsk results for both 8-lane halves of the mask are zero.
define void @mandelbrot(i32 %maxIterations) #1 {
allocas:
  ; Stack slots for the running mask, a copy of the argument, and the counter.
  %internal_mask_memory = alloca <16 x i32>
  %maxIterations1 = alloca i32
  %i = alloca <16 x i32>
  ; The mask starts with every lane set to -1 (all bits set).
  store <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32
-1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x
i32>* %internal_mask_memory
  store i32 %maxIterations, i32* %maxIterations1
  ; The counter starts at zero in every lane.
  store <16 x i32> zeroinitializer, <16 x i32>* %i, align 4
  br label %for_test

for_test:                                         ; preds = %for_loop, %allocas
  %i_load = load <16 x i32>* %i
  %maxIterations_load = load i32* %maxIterations1
  ; Broadcast the scalar limit to all 16 lanes: insert it into lane 0, then
  ; shuffle with an all-zero index mask so every lane reads lane 0.
  %maxIterations_load_broadcast_init = insertelement <16 x i32> undef, i32
%maxIterations_load, i32 0
  %maxIterations_load_broadcast = shufflevector <16 x i32>
%maxIterations_load_broadcast_init, <16 x i32> undef, <16 x i32>
zeroinitializer
  ; Per-lane signed compare i < maxIterations, producing <16 x i1>.
  %less_i_load_maxIterations_load_broadcast = icmp slt <16 x i32> %i_load,
%maxIterations_load_broadcast
  ; Widen each i1 to i32 by sign extension (true -> -1, false -> 0), then AND
  ; it into the stored mask. This sext + and pair is the pattern under
  ; discussion in this report.
  %less_i_load_maxIterations_load_broadcast_to_i32 = sext <16 x i1>
%less_i_load_maxIterations_load_broadcast to <16 x i32>
  %load_mask7 = load <16 x i32>* %internal_mask_memory
  %"oldMask&test" = and <16 x i32> %load_mask7,
%less_i_load_maxIterations_load_broadcast_to_i32
  store <16 x i32> %"oldMask&test", <16 x i32>* %internal_mask_memory
  ; Reinterpret the mask as floats so each 8-lane half can be passed to the
  ; AVX movmsk intrinsic (presumably one sign bit per lane, as in VMOVMSKPS --
  ; confirm against the intrinsic's documentation).
  %floatmask.i123 = bitcast <16 x i32> %"oldMask&test" to <16 x float>
  ; Lanes 0-7.
  %mask0.i124 = shufflevector <16 x float> %floatmask.i123, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v0.i125 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0.i124) #0
  ; Lanes 8-15.
  %mask1.i126 = shufflevector <16 x float> %floatmask.i123, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %v1.i127 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1.i126) #0
  ; Combine the two results: upper half shifted left by 8, OR'ed with the lower.
  %v1shift.i128 = shl i32 %v1.i127, 8
  %v.i129 = or i32 %v1shift.i128, %v0.i125
  ; Continue looping while the combined value is non-zero.
  %cmp.i130 = icmp ne i32 %v.i129, 0
  br i1 %cmp.i130, label %for_loop, label %for_exit

for_loop:                                         ; preds = %for_test
  ; Loop body: add 1 to every lane of the counter and store it back.
  %i_load17 = load <16 x i32>* %i
  %i_load17_plus1 = add <16 x i32> %i_load17, <i32 1, i32 1, i32 1, i32 1, i32
1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  store <16 x i32> %i_load17_plus1, <16 x i32>* %i, align 4
  br label %for_test

for_exit:                                         ; preds = %for_test
  ret void
}

The problem is in these lines:

  %less_i_load_maxIterations_load_broadcast_to_i32 = sext <16 x i1>
%less_i_load_maxIterations_load_broadcast to <16 x i32>
  %load_mask7 = load <16 x i32>* %internal_mask_memory
  %"oldMask&test" = and <16 x i32> %load_mask7,
%less_i_load_maxIterations_load_broadcast_to_i32

Revision r173901 converts them to:

  %"oldMask&test" = select <16 x i1> %less_i_load_maxIterations_load_broadcast,
<16 x i32> %internal_mask_memory.0, <16 x i32> zeroinitializer

which is worse than revision r173893 which converts them to:

  %less_i_load_maxIterations_load_broadcast_to_i32 = sext <16 x i1>
%less_i_load_maxIterations_load_broadcast to <16 x i32>
  %"oldMask&test" = and <16 x i32> %internal_mask_memory.0,
%less_i_load_maxIterations_load_broadcast_to_i32

How to reproduce:
The file mandelbrot.cpp is in the attachment.
clang++ mandelbrot.cpp -O0 -m64 -c -o mandelbrot.o
llvm-as no_opt.s -o no_opt1.bt
bin-r173901/bin/opt -O2 no_opt1.bt > sel_opt.bt
bin-r173893/bin/opt -O2 no_opt1.bt > sext_opt.bt
llc -O0 sel_opt.bt -o sel_opt.s
llc -O0 sext_opt.bt -o sext_opt.s
clang -O0 -c sel_opt.s -o sel.o
clang -O0 -c sext_opt.s -o sext.o
clang++ -O0 mandelbrot.o sel.o -o mandelbrot_sel
clang++ -O0 mandelbrot.o sext.o -o mandelbrot_sext
./mandelbrot_sel
./mandelbrot_sext

Here bin-r173901/bin/opt and bin-r173893/bin/opt are the paths to opt built at
the corresponding revisions.
We will see that the first variant (select) is slower than the second one
(sext+and).

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20130820/8e7bf95b/attachment.html>