[llvm-bugs] [Bug 28006] New: Two equivalent values not folded in loop
via llvm-bugs
llvm-bugs at lists.llvm.org
Sat Jun 4 16:20:58 PDT 2016
https://llvm.org/bugs/show_bug.cgi?id=28006
Bug ID: 28006
Summary: Two equivalent values not folded in loop
Product: libraries
Version: trunk
Hardware: PC
OS: All
Status: NEW
Severity: normal
Priority: P
Component: Scalar Optimizations
Assignee: unassignedbugs at nondot.org
Reporter: code at klickverbot.at
CC: llvm-bugs at lists.llvm.org
Classification: Unclassified
Consider the following function, a simple straight-line loop that selects the
minimal element from a range of i32s. Apart from loop unrolling, it is a
fixed point for the default optimizer pipeline (opt -O3
-disable-loop-unrolling/…) on current master:
---
; Module-level target description: the data layout string and the target
; triple this reproducer is compiled for.
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx"
; Reproducer: scans the i32 elements described by %r_arg (field 0 is used as
; the element count, field 1 as the base pointer) and returns the smallest
; value under a signed comparison, starting from %seedElement_arg. When the
; count is zero the loop is skipped and %seedElement_arg is returned as is.
; Function Attrs: noinline norecurse nounwind readonly uwtable
define i32 @foo({ i64, i32* } %r_arg, i32 %seedElement_arg) #0 {
; %1 = element count, %2 = base pointer.
%1 = extractvalue { i64, i32* } %r_arg, 0
%2 = extractvalue { i64, i32* } %r_arg, 1
; Empty range: jump straight to the exit block.
%3 = icmp eq i64 %1, 0
br i1 %3, label %endfor, label %forbody
forbody: ; preds = %0, %forbody
; Two loop-carried values. Both start at %seedElement_arg and are updated by
; the two selects below, which share one condition and one new value, so the
; two phis hold the same value on every iteration.
%extremeElement.0 = phi i32 [ %.extremeElement.0, %forbody ], [
%seedElement_arg, %0 ]
%extremeElementMapped.0 = phi i32 [ %.extremeElementMapped.0, %forbody ], [
%seedElement_arg, %0 ]
; Loop index, counting up from 0.
%__key2831.02 = phi i64 [ %7, %forbody ], [ 0, %0 ]
; Load the current element.
%4 = getelementptr i32, i32* %2, i64 %__key2831.02
%5 = load i32, i32* %4, align 4
; Signed "new element < running minimum" test; note that it reads the
; *Mapped* copy, while the function result comes from the other copy.
%6 = icmp slt i32 %5, %extremeElementMapped.0
; Same condition and same new value in both selects; only the fallback
; operand differs (each select falls back to its own phi).
%.extremeElement.0 = select i1 %6, i32 %5, i32 %extremeElement.0
%.extremeElementMapped.0 = select i1 %6, i32 %5, i32 %extremeElementMapped.0
; Advance the index and leave the loop once it reaches the element count.
%7 = add nuw i64 %__key2831.02, 1
%exitcond = icmp eq i64 %7, %1
br i1 %exitcond, label %endfor, label %forbody
endfor: ; preds = %forbody, %0
; Result: the seed if the loop never ran, otherwise the last value selected
; for %.extremeElement.0.
%extremeElement.1 = phi i32 [ %seedElement_arg, %0 ], [ %.extremeElement.0,
%forbody ]
ret i32 %extremeElement.1
}
; Attribute group #0, referenced by @foo: the function attributes plus the
; target CPU ("haswell") and the explicit target feature list.
attributes #0 = { noinline norecurse nounwind readonly uwtable
"target-cpu"="haswell"
"target-features"="+sse2,+cx16,-tbm,-avx512ifma,-avx512dq,-fma4,-prfchw,+bmi2,-xsavec,+fsgsbase,+popcnt,+aes,-pcommit,-xsaves,-avx512er,-clwb,-avx512f,-pku,-smap,+mmx,-xop,-rdseed,-hle,-sse4a,-avx512bw,-clflushopt,+xsave,-avx512vl,+invpcid,-avx512cd,+avx,-rtm,+fma,+bmi,-mwaitx,+rdrnd,+sse4.1,+sse4.2,+avx2,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,-sgx,+cmov,-avx512vbmi,+movbe,+xsaveopt,-sha,-adx,-avx512pf,+sse3"
}
---
For reasons that aren't immediately obvious to me, %extremeElement.0 and
%extremeElementMapped.0 are not folded together, even though they are trivially
equivalent. This isn't caught during instruction selection, either (note the
two cmov instructions, ~30% slower on an i7-4980HQ):
---
## Generated code for @foo (Intel syntax, operands are "dst, src").
## Register roles as used below:
##   rdi = remaining element count
##   rsi = pointer to the current i32 element
##   edx = copy of the running minimum that is returned
##   eax = copy of the running minimum that the compare reads
##   ecx = element just loaded
_foo:
.cfi_startproc
## Zero elements: skip the loop and return the seed in edx.
test rdi, rdi
je LBB0_3
## Seed the second copy of the running minimum.
mov eax, edx
## Align the loop head to 16 bytes, padding with 0x90 (NOP).
.p2align 4, 0x90
LBB0_2:
mov ecx, dword ptr [rsi]
## One signed compare feeds two conditional moves, one per copy of the
## minimum; this pair is the redundancy the report is about.
cmp ecx, eax
cmovl edx, ecx
cmovle eax, ecx
## Step to the next element and count down; loop until the count hits zero.
add rsi, 4
add rdi, -1
jne LBB0_2
LBB0_3:
## Return the edx copy.
mov eax, edx
ret
.cfi_endproc
---
I disabled loop unrolling only for clarity, as it just makes matters worse (all
the selects are duplicated). Crucially, the missed fold also prevents the loop
vectorizer from triggering.
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20160604/f4d8d309/attachment.html>
More information about the llvm-bugs
mailing list