[LLVMdev] Vectorized LLVM IR

Sat May 29 01:31:17 PDT 2010

> 
> Oh, and you might also want to check that you're actually getting SSE
> code; if you accidentally disable SSE somehow, you'll end up with x87
> code, which will completely expand the vectors into scalars.
> 
> -Eli

Using <4 x float> types:

faust -llvm -vec -vs 4 -lv 1  plus.dsp -o plus_vec.ll

; llvm_compute
;
; For each i in [0, %count), where i indexes whole <4 x float> vectors:
;
;   outputs[0][i] = (inputs[2][i] + inputs[3][i]) * (inputs[0][i] + inputs[1][i])
;
; Parameters:
;   %obj     - DSP state pointer; not used by this function.
;   %count   - number of <4 x float> vectors to process. Nothing is done when
;              %count <= 0 (signed compare).
;   %inputs  - array of at least four pointers to <4 x float> buffers.
;   %outputs - array of at least one pointer to a <4 x float> buffer.
;
; All vector loads and the store are `align 16`, so every buffer must be
; 16-byte aligned.
;
; The function reads the input buffers and writes the output buffer, so it must
; NOT be marked `readnone` (or `readonly`): with such an attribute an optimizer
; is entitled to delete or merge calls to it, silently dropping the stores.
define void @llvm_compute(%struct.llvm_dsp* noalias %obj, i32 %count, <4 x float>** noalias %inputs, <4 x float>** noalias %outputs) nounwind ssp {
		entry:
			; Fetch the four input buffer pointers and the single output
			; buffer pointer once, outside the loop.
			%input_array_ptr0 = getelementptr inbounds <4 x float>** %inputs, i64 0
			%input0 = load <4 x float>** %input_array_ptr0
			%input_array_ptr1 = getelementptr inbounds <4 x float>** %inputs, i64 1
			%input1 = load <4 x float>** %input_array_ptr1
			%input_array_ptr2 = getelementptr inbounds <4 x float>** %inputs, i64 2
			%input2 = load <4 x float>** %input_array_ptr2
			%input_array_ptr3 = getelementptr inbounds <4 x float>** %inputs, i64 3
			%input3 = load <4 x float>** %input_array_ptr3
			%output_array_ptr0 = getelementptr inbounds <4 x float>** %outputs, i64 0
			%output0 = load <4 x float>** %output_array_ptr0
			; Skip the loop entirely for a zero or negative count.
			%out = icmp sgt i32 %count, 0
			br i1 %out, label %convert, label %return
		convert:
			; %count is known to be > 0 here, so zero-extension is exact.
			%count_64 = zext i32 %count to i64
			br label %loop0
		loop0:
			; %indvar counts <4 x float> vectors, not individual floats.
			%indvar = phi i64 [ 0, %convert ], [ %indvar.next, %loop0 ]
			%output_ptr0 = getelementptr <4 x float>* %output0, i64 %indvar
			; %fVector2 = inputs[0][i] + inputs[1][i]
			%input_ptr1 = getelementptr <4 x float>* %input1, i64 %indvar
			%fVector0 = load <4 x float>* %input_ptr1, align 16
			%input_ptr0 = getelementptr <4 x float>* %input0, i64 %indvar
			%fVector1 = load <4 x float>* %input_ptr0, align 16
			%fVector2 = fadd <4 x float> %fVector1, %fVector0
			; %fVector5 = inputs[2][i] + inputs[3][i]
			%input_ptr3 = getelementptr <4 x float>* %input3, i64 %indvar
			%fVector3 = load <4 x float>* %input_ptr3, align 16
			%input_ptr2 = getelementptr <4 x float>* %input2, i64 %indvar
			%fVector4 = load <4 x float>* %input_ptr2, align 16
			%fVector5 = fadd <4 x float> %fVector4, %fVector3
			; outputs[0][i] = product of the two sums
			%fVector6 = fmul <4 x float> %fVector5, %fVector2
			store <4 x float> %fVector6, <4 x float>* %output_ptr0, align 16
			; Advance to the next vector; leave once %count vectors are done.
			%indvar.next = add i64 %indvar, 1
			%exitcond = icmp eq i64 %indvar.next, %count_64
			br i1 %exitcond, label %return, label %loop0
		return:
			ret void
}

Then `llc -O3 plus_vec.ll` gives:

## ---------------------------------------------------------------------------
## _llvm_compute -- llc output for @llvm_compute (x86-64, AT&T syntax).
##
## Register use on entry follows System V AMD64 argument order for the IR
## signature (obj, count, inputs, outputs):
##   %rdi = obj (never read; overwritten as scratch below)
##   %esi = count
##   %rdx = inputs  (array of buffer pointers)
##   %rcx = outputs (array of buffer pointers)
## NOTE(review): the leading underscore and `##` comments suggest a Darwin
## (Mach-O) target -- confirm against the llc target triple.
##
## Only caller-saved registers (%rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9,
## %xmm0, %xmm1) are written; no stack is used.
## ---------------------------------------------------------------------------
_llvm_compute:                                              ## @llvm_compute
## BB#0:                                                    ## %entry
	testl	%esi, %esi		## count <= 0 (signed)? then nothing to do
	jle	LBB8_3
## BB#1:                                                    ## %convert
	movq	(%rcx), %rax		## rax = outputs[0]
	movq	24(%rdx), %rcx		## rcx = inputs[3]
	movq	16(%rdx), %rdi		## rdi = inputs[2]
	movq	8(%rdx), %r8		## r8  = inputs[1]
	movq	(%rdx), %rdx		## rdx = inputs[0] (loaded last: rdx was the base)
	xorl	%r9d, %r9d		## r9  = byte offset into every buffer, starts at 0
	movl	%esi, %esi		## zero-extend count into rsi (32-bit write clears upper half)
	## Align the loop head, padding with 0x90 (NOP).
	## NOTE(review): on Mach-O `.align 4` means 2^4 = 16 bytes -- confirm target.
	.align	4, 0x90
LBB8_2:                                                     ## %loop0
                                                            ## Loop Depth 1
                                                            ## Loop Header
                                                            ## Inner Loop
	## Invariant: rsi = vectors remaining (> 0), r9 = 16 * vectors done.
	## movaps and the memory-operand addps need 16-byte-aligned addresses,
	## matching the `align 16` on the IR loads and store.
	movaps	(%rdx,%r9), %xmm0	## xmm0 = inputs[0][i]
	movaps	(%rdi,%r9), %xmm1	## xmm1 = inputs[2][i]
	addps	(%r8,%r9), %xmm0	## xmm0 += inputs[1][i]
	addps	(%rcx,%r9), %xmm1	## xmm1 += inputs[3][i]
	mulps	%xmm0, %xmm1		## xmm1 = (in2 + in3) * (in0 + in1)
	movaps	%xmm1, (%rax,%r9)	## outputs[0][i] = xmm1
	addq	$16, %r9		## advance by one <4 x float> (16 bytes)
	decq	%rsi			## one fewer vector remaining
	jne	LBB8_2
LBB8_3:                                                     ## %return
	ret

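So the generated code seems correct: the loop uses packed SSE instructions (movaps/addps/mulps) on whole <4 x float> vectors, rather than x87 code with the vectors expanded into scalars.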

Thanks

Stéphane Letz