[LLVMdev] Vector ops out of loops

Sat Jan 14 15:52:49 PST 2012

Hi all,

Is there an optimization pass that can move vector operations out of loops?

For example (where i is a global int):

typedef short short2 __attribute__((ext_vector_type(2)));
short2 a[50],b[50],c;

void test() {
   for (i=0; i<50; i++) {
     c.y += a[i].x * b[i].y;
   }
}

clang at -O3 gives me the following IR:

@i = common global i32 0, align 4
@a = common global [50 x <2 x i16>] zeroinitializer, align 4
@b = common global [50 x <2 x i16>] zeroinitializer, align 4
@c = common global <2 x i16> zeroinitializer, align 4

define void @test() nounwind {
entry:
   store i32 0, i32* @i, align 4
   %c.promoted = load <2 x i16>* @c, align 4
   br label %for.body

for.body:                                         ; preds = %entry, %for.body
   %inc7 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
   %0 = phi <2 x i16> [ %c.promoted, %entry ], [ %6, %for.body ]
   %storemerge6.off0 = phi i16 [ 0, %entry ], [ %extract.t, %for.body ]
   %arrayidx = getelementptr inbounds [50 x <2 x i16>]* @a, i16 0, i16 %storemerge6.off0
   %1 = load <2 x i16>* %arrayidx, align 4
   %2 = extractelement <2 x i16> %1, i32 0
   %arrayidx1 = getelementptr inbounds [50 x <2 x i16>]* @b, i16 0, i16 %storemerge6.off0
   %3 = load <2 x i16>* %arrayidx1, align 4
   %4 = extractelement <2 x i16> %3, i32 1
   %mul = mul i16 %4, %2
   %5 = extractelement <2 x i16> %0, i32 1
   %add = add i16 %5, %mul
   %6 = insertelement <2 x i16> %0, i16 %add, i32 1
   %inc = add nsw i32 %inc7, 1
   %cmp = icmp slt i32 %inc, 50
   %extract.t = trunc i32 %inc to i16
   br i1 %cmp, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
   store i32 50, i32* @i, align 4
   store <2 x i16> %6, <2 x i16>* @c, align 4
   ret void
}

The store to "c" is efficiently moved out of the loop, but the
insertelement and extractelement instructions are not.
Because we always access the second element of vector c, would it not
be more efficient to move the extractelement to the loop entry and the
insertelement to the loop exit?

Ivan