[LLVMdev] Excessive register spilling in large automatically generated functions, such as is found in FFTW

Anthony Blake amb33 at cs.waikato.ac.nz
Thu Jul 5 21:06:07 PDT 2012


Hi,

I've noticed that LLVM tends to generate suboptimal code and spill an
excessive number of registers in large functions, such as those that
are automatically generated by FFTW.

LLVM generates good code for a function that computes an 8-point
complex FFT, but from 16-point upwards, icc or gcc generates much
better code. Here is an example of a sequence of instructions from a
32-point FFT, compiled with clang/LLVM 3.1 for x86_64 with SSE:

        [...]
        movaps  32(%rdi), %xmm3
        movaps  48(%rdi), %xmm2
        movaps  %xmm3, %xmm1      ### <-- xmm3 mov'ed into xmm1
        movaps  %xmm3, %xmm4      ### <-- xmm3 mov'ed into xmm4
        addps   %xmm0, %xmm1
        movaps  %xmm1, -16(%rbp)  ## 16-byte Spill
        movaps  144(%rdi), %xmm3  ### <-- new data mov'ed into xmm3
        [...]

Here xmm3 is loaded, its value is duplicated into two other registers, and
then it is overwritten as other data is loaded into it. Can anyone shed
some light on why this might be happening?

Below is the code for a 32-point FFT that generated the above
instructions. Sorry I can't supply a smaller example, but this only
begins to happen with functions of this size.

Regards,
Anthony

//////////////////////  CODE /////////////////////////////////////////

#include <xmmintrin.h>

#define __INLINE static inline __attribute__((always_inline))
#define LOAD _mm_load_ps
#define STORE _mm_store_ps
#define ADD _mm_add_ps
#define SUB _mm_sub_ps
#define MULT _mm_mul_ps
#define STREAM _mm_stream_ps
#define SHUF _mm_shuffle_ps
#define VLIT4(a,b,c,d) _mm_set_ps(a,b,c,d)
#define SWAP(d) SHUF(d,d,_MM_SHUFFLE(2,3,0,1))
#define UNPACK2LO(a,b) SHUF(a,b,_MM_SHUFFLE(1,0,1,0))
#define UNPACK2HI(a,b) SHUF(a,b,_MM_SHUFFLE(3,2,3,2))
#define HALFBLEND(a,b) SHUF(a,b,_MM_SHUFFLE(3,2,1,0))
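
/* Note on data layout (my reading of the code below): each __m128 holds two
   complex floats interleaved as {re0, im0, re1, im1}, so e.g. MULI()
   multiplies both of them by i. */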

__INLINE void TX2(__m128 *a, __m128 *b) {
  __m128 TX2_t0 = UNPACK2LO(*a, *b);
  __m128 TX2_t1 = UNPACK2HI(*a, *b);
  *a = TX2_t0; *b = TX2_t1;
}
__INLINE void FMA(__m128 *Rd, __m128 Rn, __m128 Rm) { *Rd = ADD(*Rd, MULT(Rn, Rm)); }
__INLINE void FMS(__m128 *Rd, __m128 Rn, __m128 Rm) { *Rd = SUB(*Rd, MULT(Rn, Rm)); }
__INLINE __m128 SIGNIFY(__m128 a) {
  __m128 t1 = _mm_xor_ps(a, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
  return t1;
}
__INLINE __m128 MULI(__m128 a) { return SWAP(SIGNIFY(a)); }
__INLINE __m128 MUL(__m128 d, __m128 re, __m128 im) {
  re = MULT(re, d);
  FMS(&re, im, SWAP(d));
  return re;
}
__INLINE __m128 MULJ(__m128 d, __m128 re, __m128 im) {
  re = MULT(re, d);
  FMA(&re, im, SWAP(d));
  return re;
}
__INLINE void S_4(__m128 r0, __m128 r1, __m128 r2, __m128 r3,
                  float *o0, float *o1, float *o2, float *o3) {
  STORE(o0, r0); STORE(o1, r1); STORE(o2, r2); STORE(o3, r3);
}
__INLINE void K_0(__m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 uk, uk2, zk, zk_d;
  uk   = *r0;
  uk2  = *r1;
  zk   = ADD(*r2, *r3);
  zk_d = MULI(SUB(*r2, *r3));
  *r0 = ADD(uk, zk);
  *r2 = SUB(uk, zk);
  *r1 = SUB(uk2, zk_d);
  *r3 = ADD(uk2, zk_d);
}
__INLINE void K_N(__m128 re, __m128 im,
                  __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 uk, uk2, zk_p, zk_n, zk, zk_d;

  uk   = *r0;
  uk2  = *r1;
  zk_p = MUL(*r2, re, im);
  zk_n = MULJ(*r3, re, im);
  zk   = ADD(zk_p, zk_n);
  zk_d = MULI(SUB(zk_p, zk_n));

  *r2 = SUB(uk, zk);
  *r0 = ADD(uk, zk);
  *r3 = ADD(uk2, zk_d);
  *r1 = SUB(uk2, zk_d);
}

__INLINE void L_4_4(const float *i0, const float *i1, const float *i2, const float *i3,
                    __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 t0, t1, t2, t3, t4, t5, t6, t7;
  t0 = LOAD(i0);
  t1 = LOAD(i1);
  t2 = LOAD(i2);
  t3 = LOAD(i3);
  t4 = ADD(t0, t1);
  t5 = SUB(t0, t1);
  t6 = ADD(t2, t3);
  t7 = MULI(SUB(t2, t3));
  t0 = ADD(t4, t6);
  t2 = SUB(t4, t6);
  t1 = SUB(t5, t7);
  t3 = ADD(t5, t7);
  TX2(&t0, &t1);
  TX2(&t2, &t3);
  *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3;
}
__INLINE void L_2_2(const float *i0, const float *i1, const float *i2, const float *i3,
                    __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 t0, t1, t2, t3, t4, t5, t6, t7;
  t0 = LOAD(i0);
  t1 = LOAD(i1);
  t2 = LOAD(i2);
  t3 = LOAD(i3);

  t4 = ADD(t0, t1);
  t5 = SUB(t0, t1);
  t6 = ADD(t2, t3);
  t7 = SUB(t2, t3);
  TX2(&t4, &t5);
  TX2(&t6, &t7);
  *r0 = t4; *r2 = t5; *r1 = t6; *r3 = t7;
}
__INLINE void L_4_2(const float *i0, const float *i1, const float *i2, const float *i3,
                    __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 t0, t1, t2, t3, t4, t5, t6, t7;
  t0 = LOAD(i0);
  t1 = LOAD(i1);
  t6 = LOAD(i2);
  t7 = LOAD(i3);
  t2 = HALFBLEND(t6, t7);
  t3 = HALFBLEND(t7, t6);

  t4 = ADD(t0, t1);
  t5 = SUB(t0, t1);
  t6 = ADD(t2, t3);
  t7 = SUB(t2, t3);
  *r2 = UNPACK2HI(t4, t5);
  *r3 = UNPACK2HI(t6, t7);
  t7 = MULI(t7);
  t0 = ADD(t4, t6);
  t2 = SUB(t4, t6);
  t1 = SUB(t5, t7);
  t3 = ADD(t5, t7);
  *r0 = UNPACK2LO(t0, t1);
  *r1 = UNPACK2LO(t2, t3);
}

void fft32(const float *in, float *out) {
  __m128 r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15,
         r16_17, r18_19, r20_21, r22_23, r24_25, r26_27, r28_29, r30_31;

  L_4_4(in+0,in+32,in+16,in+48,&r0_1,&r2_3,&r16_17,&r18_19);
  L_2_2(in+8,in+40,in+56,in+24,&r4_5,&r6_7,&r20_21,&r22_23);
  K_N(VLIT4(0.7071,0.7071,1,1),VLIT4(0.7071,-0.7071,0,-0),&r0_1,&r2_3,&r4_5,&r6_7);
  L_4_2(in+4,in+36,in+20,in+52,&r8_9,&r10_11,&r28_29,&r30_31);
  L_4_4(in+60,in+28,in+12,in+44,&r12_13,&r14_15,&r24_25,&r26_27);
  K_N(VLIT4(0.9238,0.9238,1,1),VLIT4(0.3826,-0.3826,0,-0),&r0_1,&r4_5,&r8_9,&r12_13);
  K_N(VLIT4(0.3826,0.3826,0.7071,0.7071),VLIT4(0.9238,-0.9238,0.7071,-0.7071),&r2_3,&r6_7,&r10_11,&r14_15);
  K_N(VLIT4(0.7071,0.7071,1,1),VLIT4(0.7071,-0.7071,0,-0),&r16_17,&r18_19,&r20_21,&r22_23);
  K_N(VLIT4(0.7071,0.7071,1,1),VLIT4(0.7071,-0.7071,0,-0),&r24_25,&r26_27,&r28_29,&r30_31);
  K_N(VLIT4(0.9807,0.9807,1,1),VLIT4(0.1950,-0.1950,0,-0),&r0_1,&r8_9,&r16_17,&r24_25);
  S_4(r0_1,r8_9,r16_17,r24_25,out+0,out+16,out+32,out+48);
  K_N(VLIT4(0.8314,0.8314,0.9238,0.9238),VLIT4(0.5555,-0.5555,0.3826,-0.3826),&r2_3,&r10_11,&r18_19,&r26_27);
  S_4(r2_3,r10_11,r18_19,r26_27,out+4,out+20,out+36,out+52);
  K_N(VLIT4(0.5555,0.5555,0.7071,0.7071),VLIT4(0.8314,-0.8314,0.7071,-0.7071),&r4_5,&r12_13,&r20_21,&r28_29);
  S_4(r4_5,r12_13,r20_21,r28_29,out+8,out+24,out+40,out+56);
  K_N(VLIT4(0.1950,0.1950,0.3826,0.3826),VLIT4(0.9807,-0.9807,0.9238,-0.9238),&r6_7,&r14_15,&r22_23,&r30_31);
  S_4(r6_7,r14_15,r22_23,r30_31,out+12,out+28,out+44,out+60);

}
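
/* For reproduction, a minimal driver along these lines can be used. This is
   only a sketch: my understanding is that the codelet expects 32 interleaved
   complex floats (64 floats total), and the buffers must be 16-byte aligned
   for the _mm_load_ps / _mm_store_ps calls above. */
#include <stdio.h>

int main(void) {
  float in[64]  __attribute__((aligned(16)));
  float out[64] __attribute__((aligned(16)));
  int i;

  for (i = 0; i < 32; i++) {
    in[2*i]     = (float)i;  /* real part      */
    in[2*i + 1] = 0.0f;      /* imaginary part */
  }

  fft32(in, out);

  /* Dump the raw output buffer (no claim about bin ordering here). */
  for (i = 0; i < 32; i++)
    printf("%2d: %f %f\n", i, out[2*i], out[2*i + 1]);
  return 0;
}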


