[LLVMdev] Question about ARM/vfp/NEON code generation

Thu May 26 18:04:22 PDT 2011

I have a code generation question for ARM with VFP and NEON.

I am generating code for the following function as a test:

/*
 * Test function for inspecting floating-point code generation.
 *
 * Multiplies f1 by f2 and compares the product against f3: when the
 * product is strictly greater than f3 it prints f2, otherwise it prints
 * f3. Each value is printed with "%f" followed by a newline.
 */
void FloatingPointTest(float f1, float f2, float f3)
{
     const float product = f1 * f2;

     if (product > f3) {
          printf("%f\n", f2);
     } else {
          printf("%f\n", f3);
     }
}

I have tried compiling with:

	1. -mfloat-abi=softfp and -mfpu=neon
	2. -mfloat-abi=hard and -mfpu=neon
	3. -mfloat-abi=softfp and -mfpu=vfp3
	4. -mfloat-abi=hard and -mfpu=vfp3

When I use --emit-llvm -c flags to generate bitcode, and then use llc to
generate ARM assembler, I have tried supplying these flag variations to
llc:

      5. llc -mattr=+neon
      6. llc -mattr=+vfp3

I am building for armv7-a.

In all cases, I get code that looks pretty much the same; it's like what
is below. However, I am expecting to see instruction-level differences
between the vfp3 and neon versions. When I do the same with gcc 4.2, I do
see differences in the generated code.

Am I mistaken in expecting to see a difference between NEON and VFP
instructions, or is there something else going on here?

thanks,
-David

@ ---------------------------------------------------------------------
@ _FloatingPointTest -- compiler-generated listing quoted in the message.
@
@ Inputs:  r0, r1, r2 are read on entry and moved into s0, s1, s2
@          (presumably f1, f2, f3 arriving in core registers -- confirm
@          against the float ABI that was actually in effect).
@ Calls:   _printf (twice, once per branch).
@ Frame (relative to r7, which is set to sp after the 8-byte save area):
@          [r7, #4]   saved lr
@          [r7]       saved r7
@          [r7, #-4]  copy of r0
@          [r7, #-8]  copy of r1
@          [r7, #-12] copy of r2
@          [r7, #-16] product of the first two inputs
@ Every value is stored to the stack and reloaded before use, which is
@ why the scalar VFP instructions below dominate the listing.
@ ---------------------------------------------------------------------
        .private_extern _FloatingPointTest
        .globl  _FloatingPointTest
        .align  2
_FloatingPointTest:                     @ @FloatingPointTest
@ BB#0:                                 @ %entry
        @ Prologue: save lr and the caller's r7, then make r7 the frame base.
        sub     sp, sp, #8
        str     lr, [sp, #4]
        str     r7, [sp]
        mov     r7, sp
        @ Reserve 36 bytes below the frame base for locals and scratch slots.
        sub     sp, sp, #36
        @ Store each incoming core register to its frame slot and also copy
        @ it into a single-precision VFP register.
        str     r0, [r7, #-4]
        vmov    s0, r0
        str     r1, [r7, #-8]
        vmov    s1, r1
        str     r2, [r7, #-12]
        vmov    s2, r2
        @ Reload the first two inputs from the frame and multiply them.
        vldr.32 s3, [r7, #-4]
        vldr.32 s4, [r7, #-8]
        vmul.f32        s3, s3, s4
        @ Keep the product in its own frame slot.
        vstr.32 s3, [r7, #-16]
        @ Compare the product with the third input.
        vldr.32 s4, [r7, #-12]
        vcmpe.f32       s3, s4
        @ Copy the floating-point compare flags into the core condition
        @ flags so the conditional branch below can test them.
        vmrs    apsr_nzcv, fpscr
        @ Store s0/s2/s1 to scratch slots; nothing in this listing reads
        @ these three slots back.
        vstr.32 s0, [sp, #16]
        vstr.32 s2, [sp, #12]
        vstr.32 s1, [sp, #8]
        @ Branch to the second printf call unless product > third input
        @ (LE is also taken when the comparison is unordered).
        ble     LBB20_2
@ BB#1:                                 @ %bb
        @ NOTE(review): this reloads the slot that was written with the
        @ product above, whereas the C source in the message passes f2 in
        @ this branch -- the listing may come from a slightly different
        @ version of the test; confirm.
        vldr.32 s0, [r7, #-16]
        @ r0 = pc-relative offset of the format string (from the literal pool).
        ldr     r0, LCPI20_0

LPC20_0:
        @ Turn the offset into the absolute address of L_.str107.
        add     r0, pc, r0
        @ Widen the float to double and place its two words in r1:r2
        @ for the variadic call.
        vcvt.f64.f32    d1, s0
        vmov    r1, r2, d1
        bl      _printf
        @ Store printf's result to a scratch slot; it is not read afterwards.
        str     r0, [sp, #4]
        b       LBB20_3
LBB20_2:                                @ %bb1
        @ Else path: print the third input (frame slot [r7, #-12]).
        vldr.32 s0, [r7, #-12]
        @ r0 = pc-relative offset of the format string (from the literal pool).
        ldr     r0, LCPI20_1

LPC20_1:
        @ Turn the offset into the absolute address of L_.str107.
        add     r0, pc, r0
        @ Widen the float to double and place its two words in r1:r2
        @ for the variadic call.
        vcvt.f64.f32    d1, s0
        vmov    r1, r2, d1
        bl      _printf
        @ Store printf's result to a scratch slot; it is not read afterwards.
        str     r0, [sp]
LBB20_3:                                @ %bb2
@ BB#4:                                 @ %return
        @ Epilogue: drop the locals, restore r7 and lr, release the 8-byte
        @ save area, and return.
        mov     sp, r7
        ldr     r7, [sp]
        ldr     lr, [sp, #4]
        add     sp, sp, #8
        bx      lr
@ BB#5:
        @ Literal pool: offsets from each "add r0, pc, r0" to the format
        @ string. The +8 accounts for pc reading as the address of the
        @ current instruction plus 8 in ARM state.
        .align  2
LCPI20_0:
        .long   L_.str107-(LPC20_0+8)

        .align  2
LCPI20_1:
        .long   L_.str107-(LPC20_1+8)