[llvm-bugs] [Bug 28006] New: Two equivalent values not folded in loop

Sat Jun 4 16:20:58 PDT 2016

https://llvm.org/bugs/show_bug.cgi?id=28006

            Bug ID: 28006
           Summary: Two equivalent values not folded in loop
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: normal
          Priority: P
         Component: Scalar Optimizations
          Assignee: unassignedbugs at nondot.org
          Reporter: code at klickverbot.at
                CC: llvm-bugs at lists.llvm.org
    Classification: Unclassified

Consider the following function, a simple straight-line loop that selects the
minimal element from a range of i32s. Apart from loop unrolling, it is a fixed
point of the default optimizer pipeline (opt -O3 -disable-loop-unrolling/…) on
current master:

---
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx"

; foo: walks the i32 elements addressed by the pointer field of %r_arg and
; returns the smallest one under a signed compare, starting from
; %seedElement_arg. Field 0 of %r_arg is used as the element count and field 1
; as the base pointer. With a count of 0 the seed is returned unchanged.
; Function Attrs: noinline norecurse nounwind readonly uwtable
define i32 @foo({ i64, i32* } %r_arg, i32 %seedElement_arg) #0 {
  ; %1 = element count, %2 = base pointer of the range.
  %1 = extractvalue { i64, i32* } %r_arg, 0
  %2 = extractvalue { i64, i32* } %r_arg, 1
  ; Empty range: bypass the loop and return the seed via %endfor.
  %3 = icmp eq i64 %1, 0
  br i1 %3, label %endfor, label %forbody

forbody:                                          ; preds = %0, %forbody
  ; The two phis below both start at %seedElement_arg and are both updated on
  ; the back edge by selects on the same condition (%6) and same value (%5),
  ; so by induction they always hold the same value.
  %extremeElement.0 = phi i32 [ %.extremeElement.0, %forbody ], [
%seedElement_arg, %0 ]
  %extremeElementMapped.0 = phi i32 [ %.extremeElementMapped.0, %forbody ], [
%seedElement_arg, %0 ]
  ; Loop index, starting at 0.
  %__key2831.02 = phi i64 [ %7, %forbody ], [ 0, %0 ]
  ; %5 = element at the current index.
  %4 = getelementptr i32, i32* %2, i64 %__key2831.02
  %5 = load i32, i32* %4, align 4
  ; %6 is true when the current element is smaller than the running minimum.
  %6 = icmp slt i32 %5, %extremeElementMapped.0
  ; Both selects take %5 when %6 holds and otherwise keep their own phi value.
  %.extremeElement.0 = select i1 %6, i32 %5, i32 %extremeElement.0
  %.extremeElementMapped.0 = select i1 %6, i32 %5, i32 %extremeElementMapped.0
  ; Advance the index and leave the loop once it reaches the count in %1.
  %7 = add nuw i64 %__key2831.02, 1
  %exitcond = icmp eq i64 %7, %1
  br i1 %exitcond, label %endfor, label %forbody

endfor:                                           ; preds = %forbody, %0
  ; Result: the seed when the loop was skipped, else the last selected value.
  %extremeElement.1 = phi i32 [ %seedElement_arg, %0 ], [ %.extremeElement.0,
%forbody ]
  ret i32 %extremeElement.1
}

attributes #0 = { noinline norecurse nounwind readonly uwtable
"target-cpu"="haswell"
"target-features"="+sse2,+cx16,-tbm,-avx512ifma,-avx512dq,-fma4,-prfchw,+bmi2,-xsavec,+fsgsbase,+popcnt,+aes,-pcommit,-xsaves,-avx512er,-clwb,-avx512f,-pku,-smap,+mmx,-xop,-rdseed,-hle,-sse4a,-avx512bw,-clflushopt,+xsave,-avx512vl,+invpcid,-avx512cd,+avx,-rtm,+fma,+bmi,-mwaitx,+rdrnd,+sse4.1,+sse4.2,+avx2,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,-sgx,+cmov,-avx512vbmi,+movbe,+xsaveopt,-sha,-adx,-avx512pf,+sse3"
}
---

For reasons that aren't immediately obvious to me, %extremeElement.0 and
%extremeElementMapped.0 are not folded together, even though they are trivially
equivalent. This isn't caught during instruction selection, either (note the
two cmov instructions, ~30% slower on an i7-4980HQ):

---
# Generated code for foo (Intel operand order: destination first).
# Register use as visible below: rdi = remaining element count, rsi = pointer
# to the current element, edx and eax = two copies of the running minimum,
# ecx = the element just loaded. The result is returned in eax.
_foo:
    .cfi_startproc
    # Zero elements: jump straight to the exit, which returns edx unchanged.
    test    rdi, rdi
    je    LBB0_3
    # Start the second copy of the running minimum from the same initial value.
    mov    eax, edx
    .p2align    4, 0x90
LBB0_2:
    # ecx = current element; compare it against the running minimum in eax.
    mov    ecx, dword ptr [rsi]
    cmp    ecx, eax
    # Two conditional moves keep edx and eax in step. The second uses "le", but
    # on equality it only rewrites eax with an equal value, so both registers
    # still hold the same number after every iteration.
    cmovl    edx, ecx
    cmovle    eax, ecx
    # Step to the next 4-byte element and count down; loop while count != 0.
    add    rsi, 4
    add    rdi, -1
    jne    LBB0_2
LBB0_3:
    # Return the copy kept in edx.
    mov    eax, edx
    ret
    .cfi_endproc
---

I disabled loop unrolling only for clarity, as it just makes matters worse (all
the selects are duplicated). Crucially, the missed fold also causes the loop
vectorizer not to trigger.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20160604/f4d8d309/attachment.html>