[LLVMdev] bb-vectorizer transforms only part of the block
Frank Winter
fwinter at jlab.org
Mon Jun 22 12:58:22 PDT 2015
On 06/22/2015 03:56 PM, Hal Finkel wrote:
> ----- Original Message -----
>> From: "Frank Winter" <fwinter at jlab.org>
>> To: llvmdev at cs.uiuc.edu
>> Sent: Monday, June 22, 2015 2:50:32 PM
>> Subject: [LLVMdev] bb-vectorizer transforms only part of the block
>>
>> The loads, stores, and float arithmetic in the attached function should
>> be completely vectorizable. The bb-vectorizer does a good job at
>> first,
>> but from instruction %96 on it messes up by adding unnecessary
>> vector shuffles. (The function was designed so that no shuffle would
>> be
>> needed in order to vectorize it.)
>>
>> I tested this with llvm 3.6 with the following command:
>>
>> ~/toolchain/install/llvm-3.6/bin/opt -basicaa -bb-vectorize
>> -instcombine
>> -S < func_vec_8x8_complex_mul.ll
>>
>> See below for the output, which I truncated after the first few
>> vector shuffles.
>>
>> Is there a general limitation to bb-vectorize in terms of a maximum
>> number of instructions that can be vectorized? Are there any 'magic'
>> numbers in the pass that can be tweaked?
> There are several such magic numbers, but first, why are you using bb-vectorize instead of the SLP vectorizer?
Oh, good catch! Using the SLP vectorizer fixed this. All vectorized now :-)
Frank
> -Hal
>
>> Thanks,
>> Frank
>>
>>
>> ; ModuleID = '<stdin>'
>> target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
>> target triple = "x86_64-unknown-linux-gnu"
>>
>> define void @main(i64 %lo, i64 %hi, float* noalias %arg0, float*
>> noalias
>> %arg1, float* noalias %arg2) {
>> entrypoint:
>> %0 = getelementptr float* %arg1, i64 64
>> %1 = getelementptr float* %arg2, i64 64
>> %2 = getelementptr float* %arg0, i64 64
>> %3 = bitcast float* %arg1 to <4 x float>*
>> %4 = bitcast float* %0 to <4 x float>*
>> %5 = bitcast float* %arg2 to <4 x float>*
>> %6 = bitcast float* %1 to <4 x float>*
>> %7 = load <4 x float>* %3, align 16
>> %8 = load <4 x float>* %4, align 16
>> %9 = load <4 x float>* %5, align 16
>> %10 = load <4 x float>* %6, align 16
>> %11 = fmul <4 x float> %10, %7
>> %12 = fmul <4 x float> %9, %8
>> %13 = fadd <4 x float> %12, %11
>> %14 = bitcast float* %2 to <4 x float>*
>> %15 = fmul <4 x float> %10, %8
>> %16 = fmul <4 x float> %9, %7
>> %17 = fsub <4 x float> %16, %15
>> %18 = bitcast float* %arg0 to <4 x float>*
>> store <4 x float> %17, <4 x float>* %18, align 16
>> store <4 x float> %13, <4 x float>* %14, align 16
>> %19 = getelementptr float* %arg1, i64 4
>> %20 = getelementptr float* %arg1, i64 68
>> %21 = getelementptr float* %arg2, i64 4
>> %22 = getelementptr float* %arg2, i64 68
>> %23 = getelementptr float* %arg0, i64 4
>> %24 = getelementptr float* %arg0, i64 68
>> %25 = bitcast float* %19 to <4 x float>*
>> %26 = bitcast float* %20 to <4 x float>*
>> %27 = bitcast float* %21 to <4 x float>*
>> %28 = bitcast float* %22 to <4 x float>*
>> %29 = load <4 x float>* %25, align 16
>> %30 = load <4 x float>* %26, align 16
>> %31 = load <4 x float>* %27, align 16
>> %32 = load <4 x float>* %28, align 16
>> %33 = fmul <4 x float> %32, %29
>> %34 = fmul <4 x float> %31, %30
>> %35 = fadd <4 x float> %34, %33
>> %36 = bitcast float* %24 to <4 x float>*
>> %37 = fmul <4 x float> %32, %30
>> %38 = fmul <4 x float> %31, %29
>> %39 = fsub <4 x float> %38, %37
>> %40 = bitcast float* %23 to <4 x float>*
>> store <4 x float> %39, <4 x float>* %40, align 16
>> store <4 x float> %35, <4 x float>* %36, align 16
>> %41 = getelementptr float* %arg1, i64 8
>> %42 = getelementptr float* %arg1, i64 72
>> %43 = getelementptr float* %arg2, i64 8
>> %44 = getelementptr float* %arg2, i64 72
>> %45 = getelementptr float* %arg0, i64 8
>> %46 = getelementptr float* %arg0, i64 72
>> %47 = bitcast float* %41 to <4 x float>*
>> %48 = bitcast float* %42 to <4 x float>*
>> %49 = bitcast float* %43 to <4 x float>*
>> %50 = bitcast float* %44 to <4 x float>*
>> %51 = load <4 x float>* %47, align 16
>> %52 = load <4 x float>* %48, align 16
>> %53 = load <4 x float>* %49, align 16
>> %54 = load <4 x float>* %50, align 16
>> %55 = fmul <4 x float> %54, %51
>> %56 = fmul <4 x float> %53, %52
>> %57 = fadd <4 x float> %56, %55
>> %58 = bitcast float* %46 to <4 x float>*
>> %59 = fmul <4 x float> %54, %52
>> %60 = fmul <4 x float> %53, %51
>> %61 = fsub <4 x float> %60, %59
>> %62 = bitcast float* %45 to <4 x float>*
>> store <4 x float> %61, <4 x float>* %62, align 16
>> store <4 x float> %57, <4 x float>* %58, align 16
>> %63 = getelementptr float* %arg1, i64 12
>> %64 = getelementptr float* %arg1, i64 76
>> %65 = getelementptr float* %arg2, i64 12
>> %66 = getelementptr float* %arg2, i64 76
>> %67 = getelementptr float* %arg0, i64 12
>> %68 = getelementptr float* %arg0, i64 76
>> %69 = bitcast float* %63 to <4 x float>*
>> %70 = bitcast float* %64 to <4 x float>*
>> %71 = bitcast float* %65 to <4 x float>*
>> %72 = bitcast float* %66 to <4 x float>*
>> %73 = load <4 x float>* %69, align 16
>> %74 = load <4 x float>* %70, align 16
>> %75 = load <4 x float>* %71, align 16
>> %76 = load <4 x float>* %72, align 16
>> %77 = fmul <4 x float> %76, %73
>> %78 = fmul <4 x float> %75, %74
>> %79 = fadd <4 x float> %78, %77
>> %80 = bitcast float* %68 to <4 x float>*
>> %81 = fmul <4 x float> %76, %74
>> %82 = fmul <4 x float> %75, %73
>> %83 = fsub <4 x float> %82, %81
>> %84 = bitcast float* %67 to <4 x float>*
>> store <4 x float> %83, <4 x float>* %84, align 16
>> store <4 x float> %79, <4 x float>* %80, align 16
>> %85 = getelementptr float* %arg1, i64 16
>> %86 = getelementptr float* %arg1, i64 80
>> %87 = getelementptr float* %arg2, i64 16
>> %88 = getelementptr float* %arg2, i64 80
>> %89 = getelementptr float* %arg0, i64 16
>> %90 = getelementptr float* %arg0, i64 80
>> %91 = bitcast float* %85 to <4 x float>*
>> %92 = bitcast float* %86 to <4 x float>*
>> %93 = bitcast float* %87 to <4 x float>*
>> %94 = bitcast float* %88 to <4 x float>*
>> %95 = load <4 x float>* %91, align 16
>> %96 = shufflevector <4 x float> %95, <4 x float> undef, <2 x i32>
>> <i32 0, i32 1>
>> %97 = load <4 x float>* %92, align 16
>> %98 = shufflevector <4 x float> %97, <4 x float> undef, <2 x i32>
>> <i32 0, i32 1>
>> %99 = shufflevector <2 x float> %98, <2 x float> %96, <4 x i32>
>> <i32
>> 0, i32 1, i32 2, i32 3>
>> %100 = load <4 x float>* %93, align 16
>> %101 = shufflevector <4 x float> %100, <4 x float> undef, <2 x
>> i32>
>> <i32 0, i32 1>
>> ....
>>
>>
>>
>> _______________________________________________
>> LLVM Developers mailing list
>> LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
>>
More information about the llvm-dev
mailing list