[LLVMdev] Vectorized LLVM IR

Sat May 29 01:18:04 PDT 2010

On Sat, May 29, 2010 at 12:42 AM, Stéphane Letz <letz at grame.fr> wrote:
>
> Le 29 mai 2010 à 01:08, Bill Wendling a écrit :
>
>> Hi Stéphane,
>>
>> The SSE support in the LLVM backend is fine. What is the code that's generated? Do you have some short examples of where LLVM doesn't do as well as the equivalent scalar code?
>>
>> -bw
>>
>> On May 28, 2010, at 12:13 PM, Stéphane Letz wrote:
>
>
> We are actually testing LLVM for the Faust language (http://faust.grame.fr/)
>
> Currently Faust generates a C++ class from its .dsp Faust source file. So for the following simple Faust example :
>
> process = (+,+):*;
>
> Which can be displayed as the following processor (takes 4 streams of float samples, does a "+" and then a "*" operation on the streams to produce a single output)
>
>
>
>
>
> The scalar C++ code is :
>
> virtual void compute (int count, FAUSTFLOAT** input, FAUSTFLOAT** output) {
>                FAUSTFLOAT* input0 = input[0];
>                FAUSTFLOAT* input1 = input[1];
>                FAUSTFLOAT* input2 = input[2];
>                FAUSTFLOAT* input3 = input[3];
>                FAUSTFLOAT* output0 = output[0];
>                for (int i=0; i<count; i++) {
>                        output0[i] = (FAUSTFLOAT)(((float)input2[i] + (float)input3[i]) * ((float)input0[i] + (float)input1[i]));
>                }
>        }
>
> The "vectorized" C++ code is :
>
> virtual void compute (int fullcount, FAUSTFLOAT** input, FAUSTFLOAT** output) {
>                for (int index = 0; index < fullcount; index += 32) {
>                        int count = min(32, fullcount-index);
>                        FAUSTFLOAT* input0 = &input[0][index];
>                        FAUSTFLOAT* input1 = &input[1][index];
>                        FAUSTFLOAT* input2 = &input[2][index];
>                        FAUSTFLOAT* input3 = &input[3][index];
>                        FAUSTFLOAT* output0 = &output[0][index];
>                        // SECTION : 1
>                        for (int i=0; i<count; i++) {
>                                output0[i] = (FAUSTFLOAT)(((float)input2[i] + (float)input3[i]) * ((float)input0[i] + (float)input1[i]));
>                        }
>                }
>        }
>
> (so basically the C++ code is split into "vectors" [here 32 samples] that are computed in separate loops, which can be auto-vectorized by some compilers like Intel ICC; this works quite well...)
>
> The scalar LLVM code is :
>
> define void @llvm_compute(%struct.llvm_dsp*  %obj, i32 %count, float** noalias %inputs, float** noalias %outputs) nounwind readnone ssp {
>        entry:
>            %input_array_ptr0 = getelementptr inbounds float** %inputs, i64 0
>            %input0 = load float** %input_array_ptr0, align 8
>            %input_array_ptr1 = getelementptr inbounds float** %inputs, i64 1
>            %input1 = load float** %input_array_ptr1, align 8
>            %input_array_ptr2 = getelementptr inbounds float** %inputs, i64 2
>            %input2 = load float** %input_array_ptr2, align 8
>            %input_array_ptr3 = getelementptr inbounds float** %inputs, i64 3
>            %input3 = load float** %input_array_ptr3, align 8
>            %output_array_ptr0 = getelementptr inbounds float** %outputs, i64 0
>            %output0 = load float** %output_array_ptr0, align 8
>        %out = icmp sgt i32 %count, 0
>        br i1 %out, label %convert, label %return
>        convert:
>                %count_64 = zext i32 %count to i64
>                br label %loop
>        loop:
>                %indvar = phi i64 [ 0, %convert ], [ %indvar.next, %loop ]
>                %output_ptr0 = getelementptr float* %output0, i64 %indvar
>                %input_ptr1 = getelementptr float* %input1, i64 %indvar
>                %fTemp0 = load float* %input_ptr1, align 4
>                %input_ptr0 = getelementptr float* %input0, i64 %indvar
>                %fTemp1 = load float* %input_ptr0, align 4
>                %fTemp2 = fadd float %fTemp1, %fTemp0
>                %input_ptr3 = getelementptr float* %input3, i64 %indvar
>                %fTemp3 = load float* %input_ptr3, align 4
>                %input_ptr2 = getelementptr float* %input2, i64 %indvar
>                %fTemp4 = load float* %input_ptr2, align 4
>                %fTemp5 = fadd float %fTemp4, %fTemp3
>                %fTemp6 = fmul float %fTemp5, %fTemp2
>                store float %fTemp6, float* %output_ptr0, align 4
>                %indvar.next = add i64 %indvar, 1
>                %exitcond = icmp eq i64 %indvar.next, %count_64
>                br i1 %exitcond, label %return, label %loop
>        return:
>                ret void
> }
>
>
> And the vectorized LLVM code is :
>
> define void @llvm_compute(%struct.llvm_dsp* noalias %obj, i32 %count, <32 x float>** noalias %inputs, <32 x float>** noalias %outputs) nounwind readnone ssp {
>                entry:
>                    %input_array_ptr0 = getelementptr inbounds <32 x float>** %inputs, i64 0
>                    %input0 = load <32 x float>** %input_array_ptr0
>                    %input_array_ptr1 = getelementptr inbounds <32 x float>** %inputs, i64 1
>                    %input1 = load <32 x float>** %input_array_ptr1
>                    %input_array_ptr2 = getelementptr inbounds <32 x float>** %inputs, i64 2
>                    %input2 = load <32 x float>** %input_array_ptr2
>                    %input_array_ptr3 = getelementptr inbounds <32 x float>** %inputs, i64 3
>                    %input3 = load <32 x float>** %input_array_ptr3
>                    %output_array_ptr0 = getelementptr inbounds <32 x float>** %outputs, i64 0
>                    %output0 = load <32 x float>** %output_array_ptr0
>                    %out = icmp sgt i32 %count, 0
>                    br i1 %out, label %convert, label %return
>                convert:
>                        %count_64 = zext i32 %count to i64
>                        br label %loop0
>                loop0:
>                        %indvar = phi i64 [ 0, %convert ], [ %indvar.next, %loop0 ]
>                        %output_ptr0 = getelementptr <32 x float>* %output0, i64 %indvar
>                        %input_ptr1 = getelementptr <32 x float>* %input1, i64 %indvar
>                        %fVector0 = load <32 x float>* %input_ptr1, align 16;
>                        %input_ptr0 = getelementptr <32 x float>* %input0, i64 %indvar
>                        %fVector1 = load <32 x float>* %input_ptr0, align 16;
>                        %fVector2 = fadd <32 x float> %fVector1, %fVector0;
>                        %input_ptr3 = getelementptr <32 x float>* %input3, i64 %indvar
>                        %fVector3 = load <32 x float>* %input_ptr3, align 16;
>                        %input_ptr2 = getelementptr <32 x float>* %input2, i64 %indvar
>                        %fVector4 = load <32 x float>* %input_ptr2, align 16;
>                        %fVector5 = fadd <32 x float> %fVector4, %fVector3;
>                        %fVector6 = fmul <32 x float> %fVector5, %fVector2;
>                        store <32 x float> %fVector6, <32 x float>* %output_ptr0, align 16
>
>                        %indvar.next = add i64 %indvar, 1
>                        %exitcond = icmp eq i64 %indvar.next, %count_64
>                        br i1 %exitcond, label %return, label %loop0
>                return:
>                        ret void
> }
>
> We tried to play with the "align" on the load/store or "noalias" on the compute function parameters without real change.
>
> Do you see anything that is clearly not correct in the generated vectorized LLVM code? Maybe the memory bandwidth is the limiting factor in this simple example without much computation on the samples?

<32 x float> takes up 8 SSE registers; you're likely running into
issues with register pressure.  Does it work better if you use
something smaller like <4 x float>?

Besides that, I don't see any obvious issues.

-Eli