[LLVMbugs] [Bug 11836] New: Poor code generated for float<->int conversion when NEON enabled

Mon Jan 23 10:08:40 PST 2012

http://llvm.org/bugs/show_bug.cgi?id=11836

             Bug #: 11836
           Summary: Poor code generated for float<->int conversion when
                    NEON enabled
           Product: libraries
           Version: trunk
          Platform: PC
        OS/Version: Linux
            Status: NEW
          Severity: normal
          Priority: P
         Component: Backend: ARM
        AssignedTo: unassignedbugs at nondot.org
        ReportedBy: james.molloy at arm.com
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified

Created attachment 7933
  --> http://llvm.org/bugs/attachment.cgi?id=7933
Testcase. Compile with llc -mtriple armv7-none-gnueabi -mattr=+neon test.ll

When the attached testcase is compiled with NEON enabled (-mattr=+neon), llc
produces extremely bloated code for the three uitofp conversions.

$ cat test.ll
target datalayout =
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
target triple = "armv7-none-linux-gnueabi"

define float @f(<4 x i8>* nocapture %in) nounwind {
  %1 = load <4 x i8>* %in, align 4
  %2 = extractelement <4 x i8> %1, i32 0
  %3 = uitofp i8 %2 to float
  %4 = extractelement <4 x i8> %1, i32 1
  %5 = uitofp i8 %4 to float
  %6 = extractelement <4 x i8> %1, i32 2
  %7 = uitofp i8 %6 to float

  %8 = fadd float %3, %5
  %9 = fadd float %8, %7

  ret float %9      
}

$ llc -mtriple armv7-none-gnueabi -mattr=+neon  -o - test.ll
... snip ...
f:                                      @ @f
@ BB#0:
    sub    sp, sp, #8
    ldrb    r1, [r0]
    ldrb    r2, [r0, #1]
    ldrb    r3, [r0, #2]
    ldrb    r0, [r0, #3]
    strh    r0, [sp, #6]
    strh    r3, [sp, #4]
    strh    r2, [sp, #2]
    strh    r1, [sp]
    vldr    d16, [sp]
    vmov.u16    r1, d16[1]
    vmov.u16    r2, d16[0]
    vmov.u16    r0, d16[2]
    uxtb    r1, r1
    uxtb    r2, r2
    uxtb    r0, r0
    vmov    s1, r1
    vmov    s0, r2
    vmov    s2, r0
    vcvt.f32.u32    s3, s0
    vcvt.f32.u32    s1, s1
    vcvt.f32.u32    s0, s2
    vadd.f32    s1, s3, s1
    vadd.f32    s0, s1, s0
    vmov    r0, s0
    add    sp, sp, #8
    bx    lr
.Ltmp0:
    .size    f, .Ltmp0-f

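The NEON output above has unnecessary moves to and from the vector unit (the
bytes are stored to the stack as halfwords, reloaded into d16, and then
extracted back into core registers with vmov.u16), followed by completely
redundant unsigned extends (uxtb applied to values that ldrb had already
zero-extended). When compiling for VFP only (no NEON), decent code is
generated: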

f:                                      @ @f
@ BB#0:
    ldrb    r1, [r0]
    ldrb    r2, [r0, #1]
    ldrb    r0, [r0, #2]
    vmov    s0, r2
    vmov    s2, r1
    vmov    s1, r0
    vcvt.f32.u32    s0, s0
    vcvt.f32.u32    s2, s2
    vcvt.f32.u32    s1, s1
    vadd.f32    s0, s2, s0
    vadd.f32    s0, s0, s1
    vmov    r0, s0
    bx    lr

-- 
Configure bugmail: http://llvm.org/bugs/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are on the CC list for the bug.