[LLVMdev] Vectorized LLVM IR

Sat May 29 00:42:15 PDT 2010

Le 29 mai 2010 à 01:08, Bill Wendling a écrit :

> Hi Stéphane,
> 
> The SSE support in the LLVM backend is fine. What is the code that's generated? Do you have some short examples of where LLVM doesn't do as well as the equivalent scalar code?
> 
> -bw
> 
> On May 28, 2010, at 12:13 PM, Stéphane Letz wrote:

We are actually testing LLVM for the Faust language (http://faust.grame.fr/)

Currently Faust generates a C++ class from its .dsp Faust source file. So, for the following simple Faust example:

process = (+,+):*;

This can be displayed as the following processor (it takes 4 streams of float samples and applies a "+" and then a "*" operation to the streams to produce a single output):

-------------- next part --------------
A non-text attachment was scrubbed...
Name: plus.png
Type: image/png
Size: 10191 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20100529/b44926fc/attachment.png>
-------------- next part --------------

The scalar C++ code is:

virtual void compute (int count, FAUSTFLOAT** input, FAUSTFLOAT** output) {
		FAUSTFLOAT* input0 = input[0];
		FAUSTFLOAT* input1 = input[1];
		FAUSTFLOAT* input2 = input[2];
		FAUSTFLOAT* input3 = input[3];
		FAUSTFLOAT* output0 = output[0];
		for (int i=0; i<count; i++) {
			output0[i] = (FAUSTFLOAT)(((float)input2[i] + (float)input3[i]) * ((float)input0[i] + (float)input1[i]));
		}
	}

The "vectorized" C++ code is : 

virtual void compute (int fullcount, FAUSTFLOAT** input, FAUSTFLOAT** output) {
		for (int index = 0; index < fullcount; index += 32) {
			int count = min(32, fullcount-index);
			FAUSTFLOAT* input0 = &input[0][index];
			FAUSTFLOAT* input1 = &input[1][index];
			FAUSTFLOAT* input2 = &input[2][index];
			FAUSTFLOAT* input3 = &input[3][index];
			FAUSTFLOAT* output0 = &output[0][index];
			// SECTION : 1
			for (int i=0; i<count; i++) {
				output0[i] = (FAUSTFLOAT)(((float)input2[i] + (float)input3[i]) * ((float)input0[i] + (float)input1[i]));
			}
		}
	}

(So basically the C++ code is split into "vectors" [here 32 samples] that are computed in separate loops, which can then be auto-vectorized by some compilers such as Intel ICC; this works quite well.)

The scalar LLVM code is : 

; Scalar kernel: for each i in [0, %count), stores
;   (inputs[2][i] + inputs[3][i]) * (inputs[0][i] + inputs[1][i])
; into outputs[0][i]. %obj is not referenced by the body.
; The five channel pointers are loaded in the entry block, before the
; %count > 0 check; the sample loop is skipped entirely when %count <= 0.
;
; The function is intentionally NOT marked `readnone`: it loads through
; %inputs/%outputs and stores into the output buffer, so promising that it
; does not access memory would be incorrect.
define void @llvm_compute(%struct.llvm_dsp* %obj, i32 %count, float** noalias %inputs, float** noalias %outputs) nounwind ssp {
entry:
	; Fetch the four input channel pointers and the output channel pointer.
	%input_array_ptr0 = getelementptr inbounds float** %inputs, i64 0
	%input0 = load float** %input_array_ptr0, align 8
	%input_array_ptr1 = getelementptr inbounds float** %inputs, i64 1
	%input1 = load float** %input_array_ptr1, align 8
	%input_array_ptr2 = getelementptr inbounds float** %inputs, i64 2
	%input2 = load float** %input_array_ptr2, align 8
	%input_array_ptr3 = getelementptr inbounds float** %inputs, i64 3
	%input3 = load float** %input_array_ptr3, align 8
	%output_array_ptr0 = getelementptr inbounds float** %outputs, i64 0
	%output0 = load float** %output_array_ptr0, align 8
	; Guard: the loop below tests its exit condition after the body, so it
	; must not be entered with a non-positive count.
	%out = icmp sgt i32 %count, 0
	br i1 %out, label %convert, label %return
convert:
	; %count is known positive here, so zero-extension preserves its value.
	%count_64 = zext i32 %count to i64
	br label %loop
loop:
	%indvar = phi i64 [ 0, %convert ], [ %indvar.next, %loop ]
	%output_ptr0 = getelementptr float* %output0, i64 %indvar
	; fTemp2 = input0[i] + input1[i]
	%input_ptr1 = getelementptr float* %input1, i64 %indvar
	%fTemp0 = load float* %input_ptr1, align 4
	%input_ptr0 = getelementptr float* %input0, i64 %indvar
	%fTemp1 = load float* %input_ptr0, align 4
	%fTemp2 = fadd float %fTemp1, %fTemp0
	; fTemp5 = input2[i] + input3[i]
	%input_ptr3 = getelementptr float* %input3, i64 %indvar
	%fTemp3 = load float* %input_ptr3, align 4
	%input_ptr2 = getelementptr float* %input2, i64 %indvar
	%fTemp4 = load float* %input_ptr2, align 4
	%fTemp5 = fadd float %fTemp4, %fTemp3
	; output0[i] = fTemp5 * fTemp2
	%fTemp6 = fmul float %fTemp5, %fTemp2
	store float %fTemp6, float* %output_ptr0, align 4
	%indvar.next = add i64 %indvar, 1
	%exitcond = icmp eq i64 %indvar.next, %count_64
	br i1 %exitcond, label %return, label %loop
return:
	ret void
}

And the vectorized LLVM code is : 

; Vectorized kernel: for each block index b in [0, %count), stores
;   (inputs[2][b] + inputs[3][b]) * (inputs[0][b] + inputs[1][b])
; into outputs[0][b], where every element is a <32 x float> vector.
; %indvar indexes <32 x float>* and the loop runs %count times, so %count
; is a number of 32-float blocks here, not a number of individual samples.
; %obj is not referenced by the body.
;
; The function is intentionally NOT marked `readnone`: it loads through
; %inputs/%outputs and stores into the output buffer, so promising that it
; does not access memory would be incorrect.
define void @llvm_compute(%struct.llvm_dsp* noalias %obj, i32 %count, <32 x float>** noalias %inputs, <32 x float>** noalias %outputs) nounwind ssp {
entry:
	; Fetch the four input channel pointers and the output channel pointer.
	%input_array_ptr0 = getelementptr inbounds <32 x float>** %inputs, i64 0
	%input0 = load <32 x float>** %input_array_ptr0
	%input_array_ptr1 = getelementptr inbounds <32 x float>** %inputs, i64 1
	%input1 = load <32 x float>** %input_array_ptr1
	%input_array_ptr2 = getelementptr inbounds <32 x float>** %inputs, i64 2
	%input2 = load <32 x float>** %input_array_ptr2
	%input_array_ptr3 = getelementptr inbounds <32 x float>** %inputs, i64 3
	%input3 = load <32 x float>** %input_array_ptr3
	%output_array_ptr0 = getelementptr inbounds <32 x float>** %outputs, i64 0
	%output0 = load <32 x float>** %output_array_ptr0
	; Guard: the loop below tests its exit condition after the body, so it
	; must not be entered with a non-positive count.
	%out = icmp sgt i32 %count, 0
	br i1 %out, label %convert, label %return
convert:
	; %count is known positive here, so zero-extension preserves its value.
	%count_64 = zext i32 %count to i64
	br label %loop0
loop0:
	%indvar = phi i64 [ 0, %convert ], [ %indvar.next, %loop0 ]
	%output_ptr0 = getelementptr <32 x float>* %output0, i64 %indvar
	; The `align 16` on the vector loads/store tells the code generator that
	; each block address is 16-byte aligned.
	; fVector2 = input0[b] + input1[b]
	%input_ptr1 = getelementptr <32 x float>* %input1, i64 %indvar
	%fVector0 = load <32 x float>* %input_ptr1, align 16
	%input_ptr0 = getelementptr <32 x float>* %input0, i64 %indvar
	%fVector1 = load <32 x float>* %input_ptr0, align 16
	%fVector2 = fadd <32 x float> %fVector1, %fVector0
	; fVector5 = input2[b] + input3[b]
	%input_ptr3 = getelementptr <32 x float>* %input3, i64 %indvar
	%fVector3 = load <32 x float>* %input_ptr3, align 16
	%input_ptr2 = getelementptr <32 x float>* %input2, i64 %indvar
	%fVector4 = load <32 x float>* %input_ptr2, align 16
	%fVector5 = fadd <32 x float> %fVector4, %fVector3
	; output0[b] = fVector5 * fVector2
	%fVector6 = fmul <32 x float> %fVector5, %fVector2
	store <32 x float> %fVector6, <32 x float>* %output_ptr0, align 16

	%indvar.next = add i64 %indvar, 1
	%exitcond = icmp eq i64 %indvar.next, %count_64
	br i1 %exitcond, label %return, label %loop0
return:
	ret void
}

We tried to play with the "align" on the load/store or "noalias" on the compute function parameters without real change.

Do you see anything clearly incorrect in the generated vectorized LLVM code? Or maybe memory bandwidth is the limiting factor in this simple example, which does not do much computation on the samples?

Thanks.

Stéphane Letz