[LLVMdev] bb-vectorizer transforms only part of the block
Frank Winter
fwinter at jlab.org
Mon Jun 22 12:58:22 PDT 2015
On 06/22/2015 03:56 PM, Hal Finkel wrote:
> ----- Original Message -----
>> From: "Frank Winter" <fwinter at jlab.org>
>> To: llvmdev at cs.uiuc.edu
>> Sent: Monday, June 22, 2015 2:50:32 PM
>> Subject: [LLVMdev] bb-vectorizer transforms only part of the block
>>
>> The loads, stores, and float arithmetic in the attached function should
>> be completely vectorizable. The bb-vectorizer does a good job at
>> first,
>> but from instruction %96 on it messes up by adding unnecessary
>> vector shuffles. (The function was designed so that no shuffle would
>> be
>> needed in order to vectorize it.)
>>
>> I tested this with llvm 3.6 with the following command:
>>
>> ~/toolchain/install/llvm-3.6/bin/opt -basicaa -bb-vectorize
>> -instcombine
>> -S < func_vec_8x8_complex_mul.ll
>>
>> See below for the output, which I truncated after the first few
>> vector shuffles.
>>
>> Is there a general limitation to bb-vectorize in terms of a maximum
>> number of instructions that can be vectorized? Are there any 'magic'
>> numbers in the pass that can be tweaked?
> There are several such magic numbers, but first, why are you using bb-vectorize instead of the SLP vectorizer?
Oh, good catch! Using the SLP vectorizer fixed this. All vectorized now :-)
Frank
> -Hal
>
>> Thanks,
>> Frank
>>
>>
>> ; ModuleID = '<stdin>'
>> target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
>> target triple = "x86_64-unknown-linux-gnu"
>>
>> define void @main(i64 %lo, i64 %hi, float* noalias %arg0, float*
>> noalias
>> %arg1, float* noalias %arg2) {
>> entrypoint:
>> %0 = getelementptr float* %arg1, i64 64
>> %1 = getelementptr float* %arg2, i64 64
>> %2 = getelementptr float* %arg0, i64 64
>> %3 = bitcast float* %arg1 to <4 x float>*
>> %4 = bitcast float* %0 to <4 x float>*
>> %5 = bitcast float* %arg2 to <4 x float>*
>> %6 = bitcast float* %1 to <4 x float>*
>> %7 = load <4 x float>* %3, align 16
>> %8 = load <4 x float>* %4, align 16
>> %9 = load <4 x float>* %5, align 16
>> %10 = load <4 x float>* %6, align 16
>> %11 = fmul <4 x float> %10, %7
>> %12 = fmul <4 x float> %9, %8
>> %13 = fadd <4 x float> %12, %11
>> %14 = bitcast float* %2 to <4 x float>*
>> %15 = fmul <4 x float> %10, %8
>> %16 = fmul <4 x float> %9, %7
>> %17 = fsub <4 x float> %16, %15
>> %18 = bitcast float* %arg0 to <4 x float>*
>> store <4 x float> %17, <4 x float>* %18, align 16
>> store <4 x float> %13, <4 x float>* %14, align 16
>> %19 = getelementptr float* %arg1, i64 4
>> %20 = getelementptr float* %arg1, i64 68
>> %21 = getelementptr float* %arg2, i64 4
>> %22 = getelementptr float* %arg2, i64 68
>> %23 = getelementptr float* %arg0, i64 4
>> %24 = getelementptr float* %arg0, i64 68
>> %25 = bitcast float* %19 to <4 x float>*
>> %26 = bitcast float* %20 to <4 x float>*
>> %27 = bitcast float* %21 to <4 x float>*
>> %28 = bitcast float* %22 to <4 x float>*
>> %29 = load <4 x float>* %25, align 16
>> %30 = load <4 x float>* %26, align 16
>> %31 = load <4 x float>* %27, align 16
>> %32 = load <4 x float>* %28, align 16
>> %33 = fmul <4 x float> %32, %29
>> %34 = fmul <4 x float> %31, %30
>> %35 = fadd <4 x float> %34, %33
>> %36 = bitcast float* %24 to <4 x float>*
>> %37 = fmul <4 x float> %32, %30
>> %38 = fmul <4 x float> %31, %29
>> %39 = fsub <4 x float> %38, %37
>> %40 = bitcast float* %23 to <4 x float>*
>> store <4 x float> %39, <4 x float>* %40, align 16
>> store <4 x float> %35, <4 x float>* %36, align 16
>> %41 = getelementptr float* %arg1, i64 8
>> %42 = getelementptr float* %arg1, i64 72
>> %43 = getelementptr float* %arg2, i64 8
>> %44 = getelementptr float* %arg2, i64 72
>> %45 = getelementptr float* %arg0, i64 8
>> %46 = getelementptr float* %arg0, i64 72
>> %47 = bitcast float* %41 to <4 x float>*
>> %48 = bitcast float* %42 to <4 x float>*
>> %49 = bitcast float* %43 to <4 x float>*
>> %50 = bitcast float* %44 to <4 x float>*
>> %51 = load <4 x float>* %47, align 16
>> %52 = load <4 x float>* %48, align 16
>> %53 = load <4 x float>* %49, align 16
>> %54 = load <4 x float>* %50, align 16
>> %55 = fmul <4 x float> %54, %51
>> %56 = fmul <4 x float> %53, %52
>> %57 = fadd <4 x float> %56, %55
>> %58 = bitcast float* %46 to <4 x float>*
>> %59 = fmul <4 x float> %54, %52
>> %60 = fmul <4 x float> %53, %51
>> %61 = fsub <4 x float> %60, %59
>> %62 = bitcast float* %45 to <4 x float>*
>> store <4 x float> %61, <4 x float>* %62, align 16
>> store <4 x float> %57, <4 x float>* %58, align 16
>> %63 = getelementptr float* %arg1, i64 12
>> %64 = getelementptr float* %arg1, i64 76
>> %65 = getelementptr float* %arg2, i64 12
>> %66 = getelementptr float* %arg2, i64 76
>> %67 = getelementptr float* %arg0, i64 12
>> %68 = getelementptr float* %arg0, i64 76
>> %69 = bitcast float* %63 to <4 x float>*
>> %70 = bitcast float* %64 to <4 x float>*
>> %71 = bitcast float* %65 to <4 x float>*
>> %72 = bitcast float* %66 to <4 x float>*
>> %73 = load <4 x float>* %69, align 16
>> %74 = load <4 x float>* %70, align 16
>> %75 = load <4 x float>* %71, align 16
>> %76 = load <4 x float>* %72, align 16
>> %77 = fmul <4 x float> %76, %73
>> %78 = fmul <4 x float> %75, %74
>> %79 = fadd <4 x float> %78, %77
>> %80 = bitcast float* %68 to <4 x float>*
>> %81 = fmul <4 x float> %76, %74
>> %82 = fmul <4 x float> %75, %73
>> %83 = fsub <4 x float> %82, %81
>> %84 = bitcast float* %67 to <4 x float>*
>> store <4 x float> %83, <4 x float>* %84, align 16
>> store <4 x float> %79, <4 x float>* %80, align 16
>> %85 = getelementptr float* %arg1, i64 16
>> %86 = getelementptr float* %arg1, i64 80
>> %87 = getelementptr float* %arg2, i64 16
>> %88 = getelementptr float* %arg2, i64 80
>> %89 = getelementptr float* %arg0, i64 16
>> %90 = getelementptr float* %arg0, i64 80
>> %91 = bitcast float* %85 to <4 x float>*
>> %92 = bitcast float* %86 to <4 x float>*
>> %93 = bitcast float* %87 to <4 x float>*
>> %94 = bitcast float* %88 to <4 x float>*
>> %95 = load <4 x float>* %91, align 16
>> %96 = shufflevector <4 x float> %95, <4 x float> undef, <2 x i32>
>> <i32 0, i32 1>
>> %97 = load <4 x float>* %92, align 16
>> %98 = shufflevector <4 x float> %97, <4 x float> undef, <2 x i32>
>> <i32 0, i32 1>
>> %99 = shufflevector <2 x float> %98, <2 x float> %96, <4 x i32>
>> <i32
>> 0, i32 1, i32 2, i32 3>
>> %100 = load <4 x float>* %93, align 16
>> %101 = shufflevector <4 x float> %100, <4 x float> undef, <2 x
>> i32>
>> <i32 0, i32 1>
>> ....
>>
>>
>>
>> _______________________________________________
>> LLVM Developers mailing list
>> LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
>>
More information about the llvm-dev
mailing list