[LLVMdev] bb-vectorizer transforms only part of the block
Hal Finkel
hfinkel at anl.gov
Mon Jun 22 12:56:17 PDT 2015
----- Original Message -----
> From: "Frank Winter" <fwinter at jlab.org>
> To: llvmdev at cs.uiuc.edu
> Sent: Monday, June 22, 2015 2:50:32 PM
> Subject: [LLVMdev] bb-vectorizer transforms only part of the block
>
> The loads, stores, and float arithmetic in the attached function should be
> completely vectorizable. The bb-vectorizer does a good job at first, but
> from instruction %96 on it goes wrong and starts inserting unnecessary
> shufflevectors. (The function was designed so that no shuffle would be
> needed in order to vectorize it.)
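>
> For concreteness, a shuffle-free fusion of the first two <4 x float>
> blocks into <8 x float> would look roughly like the hand-written sketch
> below (the value names are illustrative only, not output from any pass):
>
>   %p1.re = bitcast float* %arg1 to <8 x float>*      ; arg1[0..7],   real
>   %g1    = getelementptr float* %arg1, i64 64
>   %p1.im = bitcast float* %g1 to <8 x float>*        ; arg1[64..71], imag
>   %p2.re = bitcast float* %arg2 to <8 x float>*      ; arg2[0..7],   real
>   %g2    = getelementptr float* %arg2, i64 64
>   %p2.im = bitcast float* %g2 to <8 x float>*        ; arg2[64..71], imag
>   %v1.re = load <8 x float>* %p1.re, align 16
>   %v1.im = load <8 x float>* %p1.im, align 16
>   %v2.re = load <8 x float>* %p2.re, align 16
>   %v2.im = load <8 x float>* %p2.im, align 16
>   %t0 = fmul <8 x float> %v2.re, %v1.re
>   %t1 = fmul <8 x float> %v2.im, %v1.im
>   %re = fsub <8 x float> %t0, %t1                    ; re = re*re - im*im
>   %t2 = fmul <8 x float> %v2.re, %v1.im
>   %t3 = fmul <8 x float> %v2.im, %v1.re
>   %im = fadd <8 x float> %t2, %t3                    ; im = re*im + im*re
>   %po.re = bitcast float* %arg0 to <8 x float>*
>   %go    = getelementptr float* %arg0, i64 64
>   %po.im = bitcast float* %go to <8 x float>*
>   store <8 x float> %re, <8 x float>* %po.re, align 16
>   store <8 x float> %im, <8 x float>* %po.im, align 16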
>
> I tested this with LLVM 3.6 using the following command:
>
> ~/toolchain/install/llvm-3.6/bin/opt -basicaa -bb-vectorize -instcombine -S < func_vec_8x8_complex_mul.ll
>
> See below for the output, which I truncated after the first few
> shufflevectors.
>
> Is there a general limitation to bb-vectorize in terms of a maximum
> number of instructions that can be vectorized? Are there any 'magic'
> numbers in the pass that can be tweaked?
There are several such magic numbers, but first, why are you using bb-vectorize instead of the SLP vectorizer?
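
For reference, a minimal invocation sketch (assuming the LLVM 3.6 legacy
pass-manager flag spellings; the threshold values below are only examples):

  # Run the SLP vectorizer on the same input:
  ~/toolchain/install/llvm-3.6/bin/opt -basicaa -slp-vectorizer -instcombine \
      -S < func_vec_8x8_complex_mul.ll

  # bb-vectorize with two of its thresholds overridden (cl::opt names
  # assumed from the 3.6-era BBVectorize pass):
  ~/toolchain/install/llvm-3.6/bin/opt -basicaa -bb-vectorize \
      -bb-vectorize-search-limit=4000 -bb-vectorize-req-chain-depth=3 \
      -instcombine -S < func_vec_8x8_complex_mul.ll
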
-Hal
>
> Thanks,
> Frank
>
>
> ; ModuleID = '<stdin>'
> target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-unknown-linux-gnu"
>
> define void @main(i64 %lo, i64 %hi, float* noalias %arg0,
>                   float* noalias %arg1, float* noalias %arg2) {
> entrypoint:
> %0 = getelementptr float* %arg1, i64 64
> %1 = getelementptr float* %arg2, i64 64
> %2 = getelementptr float* %arg0, i64 64
> %3 = bitcast float* %arg1 to <4 x float>*
> %4 = bitcast float* %0 to <4 x float>*
> %5 = bitcast float* %arg2 to <4 x float>*
> %6 = bitcast float* %1 to <4 x float>*
> %7 = load <4 x float>* %3, align 16
> %8 = load <4 x float>* %4, align 16
> %9 = load <4 x float>* %5, align 16
> %10 = load <4 x float>* %6, align 16
> %11 = fmul <4 x float> %10, %7
> %12 = fmul <4 x float> %9, %8
> %13 = fadd <4 x float> %12, %11
> %14 = bitcast float* %2 to <4 x float>*
> %15 = fmul <4 x float> %10, %8
> %16 = fmul <4 x float> %9, %7
> %17 = fsub <4 x float> %16, %15
> %18 = bitcast float* %arg0 to <4 x float>*
> store <4 x float> %17, <4 x float>* %18, align 16
> store <4 x float> %13, <4 x float>* %14, align 16
> %19 = getelementptr float* %arg1, i64 4
> %20 = getelementptr float* %arg1, i64 68
> %21 = getelementptr float* %arg2, i64 4
> %22 = getelementptr float* %arg2, i64 68
> %23 = getelementptr float* %arg0, i64 4
> %24 = getelementptr float* %arg0, i64 68
> %25 = bitcast float* %19 to <4 x float>*
> %26 = bitcast float* %20 to <4 x float>*
> %27 = bitcast float* %21 to <4 x float>*
> %28 = bitcast float* %22 to <4 x float>*
> %29 = load <4 x float>* %25, align 16
> %30 = load <4 x float>* %26, align 16
> %31 = load <4 x float>* %27, align 16
> %32 = load <4 x float>* %28, align 16
> %33 = fmul <4 x float> %32, %29
> %34 = fmul <4 x float> %31, %30
> %35 = fadd <4 x float> %34, %33
> %36 = bitcast float* %24 to <4 x float>*
> %37 = fmul <4 x float> %32, %30
> %38 = fmul <4 x float> %31, %29
> %39 = fsub <4 x float> %38, %37
> %40 = bitcast float* %23 to <4 x float>*
> store <4 x float> %39, <4 x float>* %40, align 16
> store <4 x float> %35, <4 x float>* %36, align 16
> %41 = getelementptr float* %arg1, i64 8
> %42 = getelementptr float* %arg1, i64 72
> %43 = getelementptr float* %arg2, i64 8
> %44 = getelementptr float* %arg2, i64 72
> %45 = getelementptr float* %arg0, i64 8
> %46 = getelementptr float* %arg0, i64 72
> %47 = bitcast float* %41 to <4 x float>*
> %48 = bitcast float* %42 to <4 x float>*
> %49 = bitcast float* %43 to <4 x float>*
> %50 = bitcast float* %44 to <4 x float>*
> %51 = load <4 x float>* %47, align 16
> %52 = load <4 x float>* %48, align 16
> %53 = load <4 x float>* %49, align 16
> %54 = load <4 x float>* %50, align 16
> %55 = fmul <4 x float> %54, %51
> %56 = fmul <4 x float> %53, %52
> %57 = fadd <4 x float> %56, %55
> %58 = bitcast float* %46 to <4 x float>*
> %59 = fmul <4 x float> %54, %52
> %60 = fmul <4 x float> %53, %51
> %61 = fsub <4 x float> %60, %59
> %62 = bitcast float* %45 to <4 x float>*
> store <4 x float> %61, <4 x float>* %62, align 16
> store <4 x float> %57, <4 x float>* %58, align 16
> %63 = getelementptr float* %arg1, i64 12
> %64 = getelementptr float* %arg1, i64 76
> %65 = getelementptr float* %arg2, i64 12
> %66 = getelementptr float* %arg2, i64 76
> %67 = getelementptr float* %arg0, i64 12
> %68 = getelementptr float* %arg0, i64 76
> %69 = bitcast float* %63 to <4 x float>*
> %70 = bitcast float* %64 to <4 x float>*
> %71 = bitcast float* %65 to <4 x float>*
> %72 = bitcast float* %66 to <4 x float>*
> %73 = load <4 x float>* %69, align 16
> %74 = load <4 x float>* %70, align 16
> %75 = load <4 x float>* %71, align 16
> %76 = load <4 x float>* %72, align 16
> %77 = fmul <4 x float> %76, %73
> %78 = fmul <4 x float> %75, %74
> %79 = fadd <4 x float> %78, %77
> %80 = bitcast float* %68 to <4 x float>*
> %81 = fmul <4 x float> %76, %74
> %82 = fmul <4 x float> %75, %73
> %83 = fsub <4 x float> %82, %81
> %84 = bitcast float* %67 to <4 x float>*
> store <4 x float> %83, <4 x float>* %84, align 16
> store <4 x float> %79, <4 x float>* %80, align 16
> %85 = getelementptr float* %arg1, i64 16
> %86 = getelementptr float* %arg1, i64 80
> %87 = getelementptr float* %arg2, i64 16
> %88 = getelementptr float* %arg2, i64 80
> %89 = getelementptr float* %arg0, i64 16
> %90 = getelementptr float* %arg0, i64 80
> %91 = bitcast float* %85 to <4 x float>*
> %92 = bitcast float* %86 to <4 x float>*
> %93 = bitcast float* %87 to <4 x float>*
> %94 = bitcast float* %88 to <4 x float>*
> %95 = load <4 x float>* %91, align 16
> %96 = shufflevector <4 x float> %95, <4 x float> undef, <2 x i32> <i32 0, i32 1>
> %97 = load <4 x float>* %92, align 16
> %98 = shufflevector <4 x float> %97, <4 x float> undef, <2 x i32> <i32 0, i32 1>
> %99 = shufflevector <2 x float> %98, <2 x float> %96, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
> %100 = load <4 x float>* %93, align 16
> %101 = shufflevector <4 x float> %100, <4 x float> undef, <2 x i32> <i32 0, i32 1>
> ....
>
--
Hal Finkel
Assistant Computational Scientist
Leadership Computing Facility
Argonne National Laboratory