[PATCH] [RFC] Patch for Bug 18808 - clang -O3 -flto drops Target feature flags
Eric Christopher
echristo at gmail.com
Thu Mar 26 16:28:53 PDT 2015
Very much so. Right now, this works without passing any flags to the linker:
dzur:~/tmp> cat bar.c
#include <stdint.h>
#include <stdlib.h>
typedef uint64_t avxreg __attribute__((ext_vector_type(4)));
int main(int argc, char **argv) {
avxreg x=argc,y=argc;
for (int i=0; i<argc; i++) {
int foo = atoi(argv[i]);
x = y + (avxreg)foo;
y = x;
}
return x.x;
}
dzur:~/tmp> ~/builds/build-llvm/Debug+Asserts/bin/clang -O3 -flto -mavx2 bar.c
dzur:~/tmp> llvm-objdump -disassemble a.out | more
main:
400520: 55 pushq %rbp
400521: 53 pushq %rbx
400522: 48 83 ec 28 subq $40, %rsp
400526: 48 89 f3 movq %rsi, %rbx
400529: 89 fd movl %edi, %ebp
40052b: 48 63 c5 movslq %ebp, %rax
40052e: c4 e1 f9 6e c0 vmovq %rax, %xmm0
400533: c4 e2 7d 19 c0 vbroadcastsd %xmm0, %ymm0
400538: 85 c0 testl %eax, %eax
40053a: 7e 42 jle 66
40053c: 0f 1f 40 00 nopl (%rax)
400540: c5 fc 11 04 24 vmovups %ymm0, (%rsp)
400545: 48 8b 3b movq (%rbx), %rdi
400548: 31 f6 xorl %esi, %esi
40054a: ba 0a 00 00 00 movl $10, %edx
40054f: c5 f8 77 vzeroupper
400552: e8 b9 fe ff ff callq -327
400557: 48 98 cltq
400559: c4 e1 f9 6e c0 vmovq %rax, %xmm0
40055e: c4 e2 7d 59 c0 vpbroadcastq %xmm0, %ymm0
400563: c5 fe 6f 0c 24 vmovdqu (%rsp), %ymm1
400568: c5 fd d4 c9 vpaddq %ymm1, %ymm0, %ymm1
40056c: c5 fe 7f 0c 24 vmovdqu %ymm1, (%rsp)
400571: c5 fc 10 04 24 vmovups (%rsp), %ymm0
400576: 48 83 c3 08 addq $8, %rbx
40057a: ff cd decl %ebp
40057c: 75 c2 jne -62
40057e: c4 e1 f9 7e c0 vmovq %xmm0, %rax
400583: 48 83 c4 28 addq $40, %rsp
400587: 5b popq %rbx
400588: 5d popq %rbp
400589: c5 f8 77 vzeroupper
40058c: c3 retq
40058d: 0f 1f 00 nopl (%rax)
which should be precisely what's expected.
If you look at the IR generated from clang:
dzur:~/tmp> ~/builds/build-llvm/Debug+Asserts/bin/clang -O3 -flto -mavx2 bar.c -S -o -
; ModuleID = 'bar.c'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
define i32 @main(i32 %argc, i8** nocapture readonly %argv) #0 {
entry:
%conv = sext i32 %argc to i64
%splat.splatinsert = insertelement <4 x i64> undef, i64 %conv, i32 0
%splat.splat = shufflevector <4 x i64> %splat.splatinsert, <4 x i64>
undef, <4 x i32> zeroinitializer
%cmp14 = icmp sgt i32 %argc, 0
br i1 %cmp14, label %for.body.preheader, label %for.end
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0,
%for.body.preheader ]
%y.015 = phi <4 x i64> [ %add, %for.body ], [ %splat.splat,
%for.body.preheader ]
%arrayidx = getelementptr inbounds i8*, i8** %argv, i64 %indvars.iv
%0 = load i8*, i8** %arrayidx, align 8, !tbaa !1
%call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #2
%sext = shl i64 %call.i, 32
%conv5 = ashr exact i64 %sext, 32
%splat.splatinsert6 = insertelement <4 x i64> undef, i64 %conv5, i32 0
%splat.splat7 = shufflevector <4 x i64> %splat.splatinsert6, <4 x i64>
undef, <4 x i32> zeroinitializer
%add = add <4 x i64> %splat.splat7, %y.015
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %argc
br i1 %exitcond, label %for.end.loopexit, label %for.body
for.end.loopexit: ; preds = %for.body
%add.lcssa = phi <4 x i64> [ %add, %for.body ]
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
%y.0.lcssa = phi <4 x i64> [ %splat.splat, %entry ], [ %add.lcssa,
%for.end.loopexit ]
%1 = extractelement <4 x i64> %y.0.lcssa, i64 0
%conv8 = trunc i64 %1 to i32
ret i32 %conv8
}
; Function Attrs: nounwind
declare i64 @strtol(i8* readonly, i8** nocapture, i32) #1
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false"
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false"
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
"target-cpu"="x86-64"
"target-features"="+sse4.2,+avx2,+ssse3,+sse3,+sse,+sse2,+sse4.1,+avx,+popcnt"
"unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind "less-precise-fpmad"="false"
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false"
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
"target-cpu"="x86-64"
"target-features"="+sse4.2,+avx2,+ssse3,+sse3,+sse,+sse2,+sse4.1,+avx,+popcnt"
"unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.ident = !{!0}
!0 = !{!"clang version 3.7.0 (trunk 233238) (llvm/trunk 233240)"}
!1 = !{!2, !2, i64 0}
!2 = !{!"any pointer", !3, i64 0}
!3 = !{!"omnipotent char", !4, i64 0}
!4 = !{!"Simple C/C++ TBAA"}
You'll see that the target-features attribute for the function is set to the
appropriate level of AVX.
I'll take a look at the PR to make sure we can just close it.
-eric
On Thu, Mar 26, 2015 at 4:23 PM David Blaikie <dblaikie at gmail.com> wrote:
> Eric - this sounds related to your recent work. I assume we should be
> producing the same asm, but not by passing any flags to the linker, instead
> by using function attributes?
>
>
> REPOSITORY
> rL LLVM
>
> http://reviews.llvm.org/D8629
>
> EMAIL PREFERENCES
> http://reviews.llvm.org/settings/panel/emailpreferences/
>
>
>
> _______________________________________________
> cfe-commits mailing list
> cfe-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20150326/e83b7ba5/attachment.html>
More information about the cfe-commits mailing list