[cfe-dev] easiest way to "fix-up" LLVM types generated from Intel SIMD types?

Wed Jun 2 18:06:28 PDT 2010

On Wed, Jun 2, 2010 at 5:35 PM, David Tweed <david.tweed at gmail.com> wrote:
> Hi, I wonder if anyone has any ideas about the following issue in
> attempting to do things with llvm bitcode generated from C++: clang++
> (recent-ish trunk) even at O3 compiles the following code
>
> <<<<<<<<<<<<<<<
> uint64_t
> innerLoop(__m128i *dummy,__m128i *data,int length,int stride)
> {
>    __m128i sum=_mm_setzero_si128();
>    int i=0;
>    do{
>        sum=_mm_add_epi16(sum,data[i]);
>        i+=1;
>    }while(i<length);
>    *dummy=sum;
>    return 0;
> }
> <<<<<<<<<<<<<<<
> to
> <<<<<<<<<<<<<<<
> define i64 @_Z9innerLoopPDv2_xS0_ii(<2 x i64>* nocapture %dummy, <2 x
> i64>* nocapture %data, i32 %length, i32 %stride) nounwind {
> entry:
>  %tmp = icmp sgt i32 %length, 1                  ; <i1> [#uses=1]
>  %smax = select i1 %tmp, i32 %length, i32 1      ; <i32> [#uses=1]
>  br label %do.body
>
> do.body:                                          ; preds = %do.body, %entry
>  %sum.0 = phi <2 x i64> [ zeroinitializer, %entry ], [ %2, %do.body ]
> ; <<2 x i64>> [#uses=1]
>  %i.0 = phi i32 [ 0, %entry ], [ %add, %do.body ] ; <i32> [#uses=2]
>  %arrayidx = getelementptr <2 x i64>* %data, i32 %i.0 ; <<2 x i64>*> [#uses=1]
>  %tmp3 = load <2 x i64>* %arrayidx               ; <<2 x i64>> [#uses=1]
>  %0 = bitcast <2 x i64> %sum.0 to <8 x i16>      ; <<8 x i16>> [#uses=1]
>  %1 = bitcast <2 x i64> %tmp3 to <8 x i16>       ; <<8 x i16>> [#uses=1]
>  %add.i = add nsw <8 x i16> %0, %1               ; <<8 x i16>> [#uses=1]
>  %2 = bitcast <8 x i16> %add.i to <2 x i64>      ; <<2 x i64>> [#uses=2]
>  %add = add nsw i32 %i.0, 1                      ; <i32> [#uses=2]
>  %exitcond = icmp eq i32 %add, %smax             ; <i1> [#uses=1]
>  br i1 %exitcond, label %do.end, label %do.body
>
> do.end:                                           ; preds = %do.body
>  store <2 x i64> %2, <2 x i64>* %dummy
>  ret i64 0
> }
> <<<<<<<<<<<<<<<
>
> Notice the bitcasts to/from 2xi64 within the loop body. I'm assuming
> that they're there because Intel botched things by having only
> one integer intrinsic type to anonymously cover all the different
> divisions into sub-integers but LLVM's design requires a definite
> subdivision, so they go back to canonical form as soon as possible.
> (Experiments show performing several operations on a __m128i value in
> linear sequence doesn't reconvert the values in between). I know those
> conceptual bitcasts don't cost execution time, but for bitcode
> manipulation purposes they complicate things needlessly and I'd
> really prefer to remove them. I'm happy to use a specific typename
> like i16x8 rather than __m128i in the C++ source, but I'm not sure how
> to define them in such a way that it gets understood. Or would it be
> easier to try a different way of removing them from the produced
> bitcode?

You can define the appropriate vector type as follows:
typedef short vec8 __attribute((vector_size(16)));

-Eli