[llvm-dev] AVX512 instruction generated when JIT compiling for an avx2 architecture

Thu Jun 23 10:07:14 PDT 2016

You likely haven't set the CPU features correctly; setting only the CPU name
is not enough. See llvm::sys::getHostCPUFeatures. E.g. this is what we're doing in Julia:
https://github.com/JuliaLang/julia/blob/59b253031af87f62e7d70a7d8848cdfd4a84288b/src/codegen.cpp#L5627

On Thu, Jun 23, 2016 at 1:00 PM, Frank Winter via llvm-dev
<llvm-dev at lists.llvm.org> wrote:
>
>
>
> On 06/23/2016 12:56 PM, Craig Topper wrote:
>
> Can you check what value "getHostCPUName" returned?
>
> getHostCPUName() = skylake
>
>
> On Thu, Jun 23, 2016 at 9:53 AM, Frank Winter via llvm-dev <llvm-dev at lists.llvm.org> wrote:
>>
>> With LLVM 3.8, the JIT compiler engine generates an AVX512 instruction although I target an 'avx2' CPU (Intel Core i7).
>> I just downloaded the most recent 3.8 and still it happens.
>>
>> It happens with this input module:
>>
>>
>> target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
>>
>> define void @module_cFFEMJ(i64 %lo, i64 %hi, i64 %myId, i1 %ordered, i64 %start, i32* noalias align 32 %arg0, i32* noalias align 32 %arg1) {
>> entrypoint:
>>   %0 = add nsw i64 %lo, %start
>>   %1 = add nsw i64 %hi, %start
>>   %2 = select i1 %ordered, i64 %0, i64 %lo
>>   %3 = select i1 %ordered, i64 %1, i64 %hi
>>   %4 = sdiv i64 %2, 4
>>   %5 = sdiv i64 %3, 4
>>   %6 = bitcast i32* %arg1 to i64*
>>   %7 = load i64, i64* %6, align 32
>>   %8 = trunc i64 %7 to i32
>>   %9 = getelementptr i32, i32* %arg1, i64 1
>>   %10 = lshr i64 %7, 32
>>   %11 = trunc i64 %10 to i32
>>   %12 = getelementptr i32, i32* %arg1, i64 2
>>   %13 = bitcast i32* %12 to i64*
>>   %14 = load i64, i64* %13, align 8
>>   %15 = trunc i64 %14 to i32
>>   %16 = getelementptr i32, i32* %arg1, i64 3
>>   %17 = lshr i64 %14, 32
>>   %18 = trunc i64 %17 to i32
>>   br label %L5
>>
>> L5:                                               ; preds = %L5, %entrypoint
>>   %19 = phi i64 [ %32, %L5 ], [ %4, %entrypoint ]
>>   %20 = shl i64 %19, 4
>>   %21 = or i64 %20, 4
>>   %22 = or i64 %20, 8
>>   %23 = or i64 %20, 12
>>   %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %8, i32 0
>>   %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
>>   %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %11, i32 0
>>   %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
>>   %broadcast.splatinsert13 = insertelement <4 x i32> undef, i32 %15, i32 0
>>   %broadcast.splat14 = shufflevector <4 x i32> %broadcast.splatinsert13, <4 x i32> undef, <4 x i32> zeroinitializer
>>   %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %18, i32 0
>>   %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
>>   %24 = getelementptr i32, i32* %arg0, i64 %20
>>   %25 = bitcast i32* %24 to <4 x i32>*
>>   store <4 x i32> %broadcast.splat10, <4 x i32>* %25, align 16
>>   %26 = getelementptr i32, i32* %arg0, i64 %21
>>   %27 = bitcast i32* %26 to <4 x i32>*
>>   store <4 x i32> %broadcast.splat12, <4 x i32>* %27, align 16
>>   %28 = getelementptr i32, i32* %arg0, i64 %22
>>   %29 = bitcast i32* %28 to <4 x i32>*
>>   store <4 x i32> %broadcast.splat14, <4 x i32>* %29, align 16
>>   %30 = getelementptr i32, i32* %arg0, i64 %23
>>   %31 = bitcast i32* %30 to <4 x i32>*
>>   store <4 x i32> %broadcast.splat16, <4 x i32>* %31, align 16
>>   %32 = add nsw i64 %19, 1
>>   %33 = icmp slt i64 %32, %5
>>   br i1 %33, label %L5, label %L6
>>
>> L6:                                               ; preds = %L5
>>   ret void
>> }
>>
>>
>> The following code lines show how I call the JIT compiler. ('Mod' points to the module.)
>>
>> llvm::EngineBuilder engineBuilder(std::move(std::unique_ptr<llvm::Module>(Mod)));
>> engineBuilder.setMCPU(llvm::sys::getHostCPUName());
>> engineBuilder.setEngineKind(llvm::EngineKind::JIT);
>> engineBuilder.setOptLevel(llvm::CodeGenOpt::Aggressive);
>> engineBuilder.setErrorStr(&mcjit_error);
>>
>> llvm::TargetOptions targetOptions;
>> targetOptions.AllowFPOpFusion = llvm::FPOpFusion::Fast;
>> engineBuilder.setTargetOptions( targetOptions );
>>
>> TheExecutionEngine = engineBuilder.create();
>>
>> targetMachine = engineBuilder.selectTarget();
>> Mod->setDataLayout( targetMachine->createDataLayout() );
>>
>> TheExecutionEngine->finalizeObject();  // MCJIT
>> fptr_mainFunc_extern = TheExecutionEngine->getPointerToFunction( mainFunc_extern );
>>
>>
>> When calling the function an 'illegal instruction' is raised.
>> Looking at the assembler reveals an AVX512 instruction which shouldn't be there.
>>
>> Assembly:
>>     .text
>>     .file    "module"
>>     .globl    main
>>     .align    16, 0x90
>>     .type    main,@function
>> main:
>>     .cfi_startproc
>>     movq    8(%rsp), %r10
>>     leaq    (%rdi,%r8), %rdx
>>     addq    %rsi, %r8
>>     testb    $1, %cl
>>     cmoveq    %rdi, %rdx
>>     cmoveq    %rsi, %r8
>>     movq    %rdx, %rax
>>     sarq    $63, %rax
>>     shrq    $62, %rax
>>     addq    %rdx, %rax
>>     sarq    $2, %rax
>>     movq    %r8, %rcx
>>     sarq    $63, %rcx
>>     shrq    $62, %rcx
>>     addq    %r8, %rcx
>>     sarq    $2, %rcx
>>     movq    (%r10), %r8
>>     movq    8(%r10), %r10
>>     movq    %r8, %rdi
>>     shrq    $32, %rdi
>>     movq    %r10, %rsi
>>     shrq    $32, %rsi
>>     movq    %rax, %rdx
>>     shlq    $6, %rdx
>>     leaq    48(%rdx,%r9), %rdx
>>     .align    16, 0x90
>> .LBB0_1:
>>     vmovd    %r8d, %xmm0
>>     vpbroadcastd    %xmm0, %xmm0
>>     vmovd    %edi, %xmm1
>>     vpbroadcastd    %xmm1, %xmm1
>>     vmovd    %r10d, %xmm2
>>     vpbroadcastd    %xmm2, %xmm2
>>     vmovd    %esi, %xmm3
>>     vpbroadcastd    %xmm3, %xmm3
>>     vmovdqa32    %xmm0, -48(%rdx)
>>     vmovdqa32    %xmm1, -32(%rdx)
>>     vmovdqa32    %xmm2, -16(%rdx)
>>     vmovdqa32    %xmm3, (%rdx)
>>     addq    $1, %rax
>>     addq    $64, %rdx
>>     cmpq    %rcx, %rax
>>     jl    .LBB0_1
>>     retq
>> .Lfunc_end0:
>>     .size    main, .Lfunc_end0-main
>>     .cfi_endproc
>>
>>
>>     .section    ".note.GNU-stack","",@progbits
>>
>> end assembly!
>>
>> I am not sure which instruction is the offending one, but 'vmovdqa32' looks like an AVX-512 instruction.
>>
>> I wasn't able to reproduce this with 'opt' - it generates avx2 instructions. And when I force it to use e.g. avx512f it rejects the CPU type.
>>
>> Any ideas?
>>
>>
>> Frank
>> _______________________________________________
>> LLVM Developers mailing list
>> llvm-dev at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>
>
>
>
> --
> ~Craig
>
>
>
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>