[LLVMdev] bb-vectorizer transforms only part of the block

Hal Finkel hfinkel at anl.gov
Mon Jun 22 12:56:17 PDT 2015


----- Original Message -----
> From: "Frank Winter" <fwinter at jlab.org>
> To: llvmdev at cs.uiuc.edu
> Sent: Monday, June 22, 2015 2:50:32 PM
> Subject: [LLVMdev] bb-vectorizer transforms only part of the block
> 
> The loads, stores, and float arithmetic in the attached function should be
> completely vectorizable. The bb-vectorizer does a good job at first, but
> from instruction %96 on it goes wrong, inserting unnecessary
> shufflevectors. (The function was designed so that no shuffle would be
> needed in order to vectorize it.)
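> (Concretely, each block below is one 4-wide complex multiply on a split
> layout: real parts at offset 0, imaginary parts at offset 64, so
> re = re1*re2 - im1*im2 and im = re1*im2 + im1*re2 line up
> lane-for-lane with no shuffling.)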
> 
> I tested this with llvm 3.6 with the following command:
> 
> ~/toolchain/install/llvm-3.6/bin/opt -basicaa -bb-vectorize -instcombine -S < func_vec_8x8_complex_mul.ll
> 
> See below for the output, which I truncated after a few shufflevectors.
> 
> Is there a general limit in bb-vectorize on the number of instructions
> it can vectorize? Are there any 'magic' numbers in the pass that can
> be tweaked?

There are several such magic numbers, but first, why are you using bb-vectorize instead of the SLP vectorizer?
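
For comparison, the SLP vectorizer can be run on the same input with an invocation along these lines (same opt binary and input file as in your command):

  ~/toolchain/install/llvm-3.6/bin/opt -basicaa -slp-vectorizer -instcombine -S < func_vec_8x8_complex_mul.ll

As for the knobs: they are cl::opts in lib/Transforms/Vectorize/BBVectorize.cpp, e.g. -bb-vectorize-search-limit and -bb-vectorize-req-chain-depth; see the source for the full list and their defaults.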

 -Hal

> 
> Thanks,
> Frank
> 
> 
> ; ModuleID = '<stdin>'
> target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-unknown-linux-gnu"
> 
> define void @main(i64 %lo, i64 %hi, float* noalias %arg0,
>                   float* noalias %arg1, float* noalias %arg2) {
> entrypoint:
>    %0 = getelementptr float* %arg1, i64 64
>    %1 = getelementptr float* %arg2, i64 64
>    %2 = getelementptr float* %arg0, i64 64
>    %3 = bitcast float* %arg1 to <4 x float>*
>    %4 = bitcast float* %0 to <4 x float>*
>    %5 = bitcast float* %arg2 to <4 x float>*
>    %6 = bitcast float* %1 to <4 x float>*
>    %7 = load <4 x float>* %3, align 16
>    %8 = load <4 x float>* %4, align 16
>    %9 = load <4 x float>* %5, align 16
>    %10 = load <4 x float>* %6, align 16
>    %11 = fmul <4 x float> %10, %7
>    %12 = fmul <4 x float> %9, %8
>    %13 = fadd <4 x float> %12, %11
>    %14 = bitcast float* %2 to <4 x float>*
>    %15 = fmul <4 x float> %10, %8
>    %16 = fmul <4 x float> %9, %7
>    %17 = fsub <4 x float> %16, %15
>    %18 = bitcast float* %arg0 to <4 x float>*
>    store <4 x float> %17, <4 x float>* %18, align 16
>    store <4 x float> %13, <4 x float>* %14, align 16
>    %19 = getelementptr float* %arg1, i64 4
>    %20 = getelementptr float* %arg1, i64 68
>    %21 = getelementptr float* %arg2, i64 4
>    %22 = getelementptr float* %arg2, i64 68
>    %23 = getelementptr float* %arg0, i64 4
>    %24 = getelementptr float* %arg0, i64 68
>    %25 = bitcast float* %19 to <4 x float>*
>    %26 = bitcast float* %20 to <4 x float>*
>    %27 = bitcast float* %21 to <4 x float>*
>    %28 = bitcast float* %22 to <4 x float>*
>    %29 = load <4 x float>* %25, align 16
>    %30 = load <4 x float>* %26, align 16
>    %31 = load <4 x float>* %27, align 16
>    %32 = load <4 x float>* %28, align 16
>    %33 = fmul <4 x float> %32, %29
>    %34 = fmul <4 x float> %31, %30
>    %35 = fadd <4 x float> %34, %33
>    %36 = bitcast float* %24 to <4 x float>*
>    %37 = fmul <4 x float> %32, %30
>    %38 = fmul <4 x float> %31, %29
>    %39 = fsub <4 x float> %38, %37
>    %40 = bitcast float* %23 to <4 x float>*
>    store <4 x float> %39, <4 x float>* %40, align 16
>    store <4 x float> %35, <4 x float>* %36, align 16
>    %41 = getelementptr float* %arg1, i64 8
>    %42 = getelementptr float* %arg1, i64 72
>    %43 = getelementptr float* %arg2, i64 8
>    %44 = getelementptr float* %arg2, i64 72
>    %45 = getelementptr float* %arg0, i64 8
>    %46 = getelementptr float* %arg0, i64 72
>    %47 = bitcast float* %41 to <4 x float>*
>    %48 = bitcast float* %42 to <4 x float>*
>    %49 = bitcast float* %43 to <4 x float>*
>    %50 = bitcast float* %44 to <4 x float>*
>    %51 = load <4 x float>* %47, align 16
>    %52 = load <4 x float>* %48, align 16
>    %53 = load <4 x float>* %49, align 16
>    %54 = load <4 x float>* %50, align 16
>    %55 = fmul <4 x float> %54, %51
>    %56 = fmul <4 x float> %53, %52
>    %57 = fadd <4 x float> %56, %55
>    %58 = bitcast float* %46 to <4 x float>*
>    %59 = fmul <4 x float> %54, %52
>    %60 = fmul <4 x float> %53, %51
>    %61 = fsub <4 x float> %60, %59
>    %62 = bitcast float* %45 to <4 x float>*
>    store <4 x float> %61, <4 x float>* %62, align 16
>    store <4 x float> %57, <4 x float>* %58, align 16
>    %63 = getelementptr float* %arg1, i64 12
>    %64 = getelementptr float* %arg1, i64 76
>    %65 = getelementptr float* %arg2, i64 12
>    %66 = getelementptr float* %arg2, i64 76
>    %67 = getelementptr float* %arg0, i64 12
>    %68 = getelementptr float* %arg0, i64 76
>    %69 = bitcast float* %63 to <4 x float>*
>    %70 = bitcast float* %64 to <4 x float>*
>    %71 = bitcast float* %65 to <4 x float>*
>    %72 = bitcast float* %66 to <4 x float>*
>    %73 = load <4 x float>* %69, align 16
>    %74 = load <4 x float>* %70, align 16
>    %75 = load <4 x float>* %71, align 16
>    %76 = load <4 x float>* %72, align 16
>    %77 = fmul <4 x float> %76, %73
>    %78 = fmul <4 x float> %75, %74
>    %79 = fadd <4 x float> %78, %77
>    %80 = bitcast float* %68 to <4 x float>*
>    %81 = fmul <4 x float> %76, %74
>    %82 = fmul <4 x float> %75, %73
>    %83 = fsub <4 x float> %82, %81
>    %84 = bitcast float* %67 to <4 x float>*
>    store <4 x float> %83, <4 x float>* %84, align 16
>    store <4 x float> %79, <4 x float>* %80, align 16
>    %85 = getelementptr float* %arg1, i64 16
>    %86 = getelementptr float* %arg1, i64 80
>    %87 = getelementptr float* %arg2, i64 16
>    %88 = getelementptr float* %arg2, i64 80
>    %89 = getelementptr float* %arg0, i64 16
>    %90 = getelementptr float* %arg0, i64 80
>    %91 = bitcast float* %85 to <4 x float>*
>    %92 = bitcast float* %86 to <4 x float>*
>    %93 = bitcast float* %87 to <4 x float>*
>    %94 = bitcast float* %88 to <4 x float>*
>    %95 = load <4 x float>* %91, align 16
>    %96 = shufflevector <4 x float> %95, <4 x float> undef, <2 x i32> <i32 0, i32 1>
>    %97 = load <4 x float>* %92, align 16
>    %98 = shufflevector <4 x float> %97, <4 x float> undef, <2 x i32> <i32 0, i32 1>
>    %99 = shufflevector <2 x float> %98, <2 x float> %96, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
>    %100 = load <4 x float>* %93, align 16
>    %101 = shufflevector <4 x float> %100, <4 x float> undef, <2 x i32> <i32 0, i32 1>
> ....
> 
> 
> 

-- 
Hal Finkel
Assistant Computational Scientist
Leadership Computing Facility
Argonne National Laboratory


