[llvm-dev] LLVM 6.0's LoopUnroll PASS is not able to work?
翟翔 via llvm-dev
llvm-dev at lists.llvm.org
Thu Oct 26 17:12:31 PDT 2017
Hi Jatin,
Thanks for your kind response!
The original source: https://github.com/ScaffCC/ScaffCC/blob/master/Algorithms/Cat_State/cat_state.n04.scaffold
I will dig into the root cause of why the disable-unroll metadata is there.
发自我的iPhone
------------------ Original ------------------
From: Jatin Bhateja <jatin.bhateja at gmail.com>
Date: 周五,10月 27,2017 0:54 上午
To: Leslie Zhai <lesliezhai at llvm.org.cn>
Cc: Michael Kruse <llvmdev at meinersbur.de>, LLVM Developers Mailing List <llvm-dev at lists.llvm.org>
Subject: Re: [llvm-dev] LLVM 6.0's LoopUnroll PASS is not able to work?
Hi Leslie,
There is disable-unroll metadata (!llvm.loop !2) attached to the loop basic block in unCatN; probably `#pragma clang loop unroll(disable)` was used before the loop in the source.
I tried removing that and used -unroll-count=4; the loops in both catN and unCatN were then unrolled.
Options : -mem2reg -loops -loop-simplify -loop-rotate -lcssa -loop-unroll -unroll-count=4 -sccp -simplifycfg -o /tmp/1 -debug-only=loop-unroll.
Loop Unroll: F[catN] Loop %for.body
Loop Size = 17
UNROLLING loop %for.body by 4!
Loop Unroll: F[unCatN] Loop %for.body
Loop Size = 9
UNROLLING loop %for.body by 4 with run-time trip count!
Thanks,
Jatin
On Wed, Oct 25, 2017 at 10:17 AM, Leslie Zhai via llvm-dev <llvm-dev at lists.llvm.org> wrote:
Hi Michael,
Even after dropping *optnone*, the loops in this testcase still fail to unroll:
$ cat cat_state.n044.ll
; ModuleID = 'cat_state.n045a.ll'
source_filename = "cat_state.n04_merged.scaffold"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: noinline nounwind uwtable
define void @catN(i16* %bit, i32 %n) local_unnamed_addr #0 {
entry:
%0 = load i16, i16* %bit, align 2
tail call void @llvm.H.i16(i16 %0)
%cmp1 = icmp sgt i32 %n, 1
br i1 %cmp1, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry
%1 = add i32 %n, -1
%2 = add i32 %n, -2
%xtraiter = and i32 %1, 1
%3 = icmp ult i32 %2, 1
br i1 %3, label %for.cond.for.end_crit_edge.unr-lcssa, label %for.body.lr.ph.new
for.body.lr.ph.new: ; preds = %for.body.lr.ph
%unroll_iter = sub i32 %1, %xtraiter
br label %for.body
for.body: ; preds = %for.body, %for.body.lr.ph.new
%inc3 = phi i32 [ 1, %for.body.lr.ph.new ], [ %inc.1, %for.body ]
%niter = phi i32 [ %unroll_iter, %for.body.lr.ph.new ], [ %niter.nsub.1, %for.body ]
%idxprom = sext i32 %inc3 to i64
%arrayidx1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom
%4 = load i16, i16* %arrayidx1, align 2
%sub = add nsw i32 %inc3, -1
%idxprom2 = sext i32 %sub to i64
%arrayidx3 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2
%5 = load i16, i16* %arrayidx3, align 2
tail call void @llvm.CNOT.i16.i16(i16 %4, i16 %5)
%inc = add nsw i32 %inc3, 1
%niter.nsub = sub i32 %niter, 1
%idxprom.1 = sext i32 %inc to i64
%arrayidx1.1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom.1
%6 = load i16, i16* %arrayidx1.1, align 2
%idxprom2.1 = sext i32 %inc3 to i64
%arrayidx3.1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2.1
%7 = load i16, i16* %arrayidx3.1, align 2
tail call void @llvm.CNOT.i16.i16(i16 %6, i16 %7)
%inc.1 = add nsw i32 %inc, 1
%niter.nsub.1 = sub i32 %niter.nsub, 1
%niter.ncmp.1 = icmp ne i32 %niter.nsub.1, 0
br i1 %niter.ncmp.1, label %for.body, label %for.cond.for.end_crit_edge.unr-lcssa
for.cond.for.end_crit_edge.unr-lcssa: ; preds = %for.body, %for.body.lr.ph
%inc3.unr = phi i32 [ 1, %for.body.lr.ph ], [ %inc.1, %for.body ]
%lcmp.mod = icmp ne i32 %xtraiter, 0
br i1 %lcmp.mod, label %for.body.epil, label %for.end
for.body.epil: ; preds = %for.cond.for.end_crit_edge.unr-lcssa
%inc3.epil = phi i32 [ %inc3.unr, %for.cond.for.end_crit_edge.unr-lcssa ]
%idxprom.epil = sext i32 %inc3.epil to i64
%arrayidx1.epil = getelementptr inbounds i16, i16* %bit, i64 %idxprom.epil
%8 = load i16, i16* %arrayidx1.epil, align 2
%sub.epil = add nsw i32 %inc3.epil, -1
%idxprom2.epil = sext i32 %sub.epil to i64
%arrayidx3.epil = getelementptr inbounds i16, i16* %bit, i64 %idxprom2.epil
%9 = load i16, i16* %arrayidx3.epil, align 2
tail call void @llvm.CNOT.i16.i16(i16 %8, i16 %9)
%inc.epil = add nsw i32 %inc3.epil, 1
%cmp.epil = icmp slt i32 %inc.epil, %n
br label %for.end
for.end: ; preds = %for.body.epil, %for.cond.for.end_crit_edge.unr-lcssa, %entry
ret void
}
; Function Attrs: nounwind
declare void @llvm.H.i16(i16) #1
; Function Attrs: nounwind
declare void @llvm.CNOT.i16.i16(i16, i16) #1
; Function Attrs: noinline nounwind uwtable
define void @unCatN(i16* %bit, i32 %n) local_unnamed_addr #0 {
entry:
%storemerge1 = add nsw i32 %n, -1
%cmp2 = icmp sgt i32 %n, 1
br i1 %cmp2, label %for.body.peel, label %for.end
for.body.peel: ; preds = %entry
%idxprom.peel = sext i32 %storemerge1 to i64
%arrayidx.peel = getelementptr inbounds i16, i16* %bit, i64 %idxprom.peel
%0 = load i16, i16* %arrayidx.peel, align 2
%sub1.peel = add nsw i32 %n, -2
%idxprom2.peel = sext i32 %sub1.peel to i64
%arrayidx3.peel = getelementptr inbounds i16, i16* %bit, i64 %idxprom2.peel
%1 = load i16, i16* %arrayidx3.peel, align 2
tail call void @llvm.CNOT.i16.i16(i16 %0, i16 %1)
%storemerge.peel = add nsw i32 %storemerge1, -1
%cmp.peel = icmp sgt i32 %storemerge1, 1
br i1 %cmp.peel, label %for.body.lr.ph.peel.newph, label %for.end
for.body.lr.ph.peel.newph: ; preds = %for.body.peel
br label %for.body
for.body: ; preds = %for.body, %for.body.lr.ph.peel.newph
%storemerge5 = phi i32 [ %storemerge.peel, %for.body.lr.ph.peel.newph ], [ %storemerge, %for.body ]
%storemerge.in3 = phi i32 [ %storemerge1, %for.body.lr.ph.peel.newph ], [ %storemerge5, %for.body ]
%idxprom = sext i32 %storemerge5 to i64
%arrayidx = getelementptr inbounds i16, i16* %bit, i64 %idxprom
%2 = load i16, i16* %arrayidx, align 2
%sub1 = add nsw i32 %storemerge.in3, -2
%idxprom2 = sext i32 %sub1 to i64
%arrayidx3 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2
%3 = load i16, i16* %arrayidx3, align 2
tail call void @llvm.CNOT.i16.i16(i16 %2, i16 %3)
%storemerge = add nsw i32 %storemerge5, -1
%cmp = icmp sgt i32 %storemerge5, 1
br i1 %cmp, label %for.body, label %for.end, !llvm.loop !2
for.end: ; preds = %for.body, %for.body.peel, %entry
%.lcssa = phi i16* [ %bit, %entry ], [ %bit, %for.body.peel ], [ %bit, %for.body ]
%4 = load i16, i16* %.lcssa, align 2
tail call void @llvm.H.i16(i16 %4)
ret void
}
; Function Attrs: noinline nounwind uwtable
define i32 @main() local_unnamed_addr #0 {
entry:
%bits = alloca [4 x i16], align 2
%arraydecay = getelementptr inbounds [4 x i16], [4 x i16]* %bits, i64 0, i64 0
call void @catN_IP4_IPx_IPx_IPx_DPx_DPx_DPx_DPx(i16* %arraydecay, i32 undef)
ret i32 0
}
define void @catN_IP4_IPx_IPx_IPx_DPx_DPx_DPx_DPx(i16* %bit, i32 %n) {
entry.:
%0 = load i16, i16* %bit, align 2
tail call void @llvm.H.i16(i16 %0)
%arrayidx1. = getelementptr inbounds i16, i16* %bit, i64 1
%1 = load i16, i16* %arrayidx1., align 2
%2 = load i16, i16* %bit, align 2
tail call void @llvm.CNOT.i16.i16(i16 %1, i16 %2)
%arrayidx1.1. = getelementptr inbounds i16, i16* %bit, i64 2
%3 = load i16, i16* %arrayidx1.1., align 2
%arrayidx3.1. = getelementptr inbounds i16, i16* %bit, i64 1
%4 = load i16, i16* %arrayidx3.1., align 2
tail call void @llvm.CNOT.i16.i16(i16 %3, i16 %4)
%arrayidx1.epil. = getelementptr inbounds i16, i16* %bit, i64 3
%5 = load i16, i16* %arrayidx1.epil., align 2
%arrayidx3.epil. = getelementptr inbounds i16, i16* %bit, i64 2
%6 = load i16, i16* %arrayidx3.epil., align 2
tail call void @llvm.CNOT.i16.i16(i16 %5, i16 %6)
ret void
}
attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 6.0.0 (git at github.com:llvm-mirror/clang.git 0aed123216ad4a38a9c2b16f1783895fd5cb1a04) (git at github.com:llvm-mirror/llvm.git d209b37aec1e392dabbf9b5324ea4a60c36fbc55)"}
!2 = distinct !{!2, !3}
!3 = !{!"llvm.loop.unroll.disable"}
$(OPT) -S cat_state.n044.ll -mem2reg -loops -loop-simplify -loop-rotate -lcssa -loop-unroll -unroll-threshold=100000000 -sccp -simplifycfg -o cat_state.n045.ll
$ cat cat_state.n045.ll
; ModuleID = 'cat_state.n044.ll'
source_filename = "cat_state.n04_merged.scaffold"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: noinline nounwind uwtable
define void @catN(i16* %bit, i32 %n) local_unnamed_addr #0 {
entry:
%0 = load i16, i16* %bit, align 2
tail call void @llvm.H.i16(i16 %0)
%cmp1 = icmp sgt i32 %n, 1
br i1 %cmp1, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry
%1 = add i32 %n, -1
%2 = add i32 %n, -2
%xtraiter = and i32 %1, 1
%3 = icmp ult i32 %2, 1
br i1 %3, label %for.cond.for.end_crit_edge.unr-lcssa, label %for.body.lr.ph.new
for.body.lr.ph.new: ; preds = %for.body.lr.ph
%unroll_iter = sub i32 %1, %xtraiter
br label %for.body
for.body: ; preds = %for.body, %for.body.lr.ph.new
%inc3 = phi i32 [ 1, %for.body.lr.ph.new ], [ %inc.1, %for.body ]
%niter = phi i32 [ %unroll_iter, %for.body.lr.ph.new ], [ %niter.nsub.1, %for.body ]
%idxprom = sext i32 %inc3 to i64
%arrayidx1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom
%4 = load i16, i16* %arrayidx1, align 2
%sub = add nsw i32 %inc3, -1
%idxprom2 = sext i32 %sub to i64
%arrayidx3 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2
%5 = load i16, i16* %arrayidx3, align 2
tail call void @llvm.CNOT.i16.i16(i16 %4, i16 %5)
%inc = add nsw i32 %inc3, 1
%niter.nsub = sub i32 %niter, 1
%idxprom.1 = sext i32 %inc to i64
%arrayidx1.1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom.1
%6 = load i16, i16* %arrayidx1.1, align 2
%idxprom2.1 = sext i32 %inc3 to i64
%arrayidx3.1 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2.1
%7 = load i16, i16* %arrayidx3.1, align 2
tail call void @llvm.CNOT.i16.i16(i16 %6, i16 %7)
%inc.1 = add nsw i32 %inc, 1
%niter.nsub.1 = sub i32 %niter.nsub, 1
%niter.ncmp.1 = icmp ne i32 %niter.nsub.1, 0
br i1 %niter.ncmp.1, label %for.body, label %for.cond.for.end_crit_edge.unr-lcssa
for.cond.for.end_crit_edge.unr-lcssa: ; preds = %for.body, %for.body.lr.ph
%inc3.unr = phi i32 [ 1, %for.body.lr.ph ], [ %inc.1, %for.body ]
%lcmp.mod = icmp ne i32 %xtraiter, 0
br i1 %lcmp.mod, label %for.body.epil, label %for.end
for.body.epil: ; preds = %for.cond.for.end_crit_edge.unr-lcssa
%inc3.epil = phi i32 [ %inc3.unr, %for.cond.for.end_crit_edge.unr-lcssa ]
%idxprom.epil = sext i32 %inc3.epil to i64
%arrayidx1.epil = getelementptr inbounds i16, i16* %bit, i64 %idxprom.epil
%8 = load i16, i16* %arrayidx1.epil, align 2
%sub.epil = add nsw i32 %inc3.epil, -1
%idxprom2.epil = sext i32 %sub.epil to i64
%arrayidx3.epil = getelementptr inbounds i16, i16* %bit, i64 %idxprom2.epil
%9 = load i16, i16* %arrayidx3.epil, align 2
tail call void @llvm.CNOT.i16.i16(i16 %8, i16 %9)
%inc.epil = add nsw i32 %inc3.epil, 1
%cmp.epil = icmp slt i32 %inc.epil, %n
br label %for.end
for.end: ; preds = %for.body.epil, %for.cond.for.end_crit_edge.unr-lcssa, %entry
ret void
}
; Function Attrs: nounwind
declare void @llvm.H.i16(i16) #1
; Function Attrs: nounwind
declare void @llvm.CNOT.i16.i16(i16, i16) #1
; Function Attrs: noinline nounwind uwtable
define void @unCatN(i16* %bit, i32 %n) local_unnamed_addr #0 {
entry:
%storemerge1 = add nsw i32 %n, -1
%cmp2 = icmp sgt i32 %n, 1
br i1 %cmp2, label %for.body.peel, label %for.end
for.body.peel: ; preds = %entry
%idxprom.peel = sext i32 %storemerge1 to i64
%arrayidx.peel = getelementptr inbounds i16, i16* %bit, i64 %idxprom.peel
%0 = load i16, i16* %arrayidx.peel, align 2
%sub1.peel = add nsw i32 %n, -2
%idxprom2.peel = sext i32 %sub1.peel to i64
%arrayidx3.peel = getelementptr inbounds i16, i16* %bit, i64 %idxprom2.peel
%1 = load i16, i16* %arrayidx3.peel, align 2
tail call void @llvm.CNOT.i16.i16(i16 %0, i16 %1)
%storemerge.peel = add nsw i32 %storemerge1, -1
%cmp.peel = icmp sgt i32 %storemerge1, 1
br i1 %cmp.peel, label %for.body.lr.ph.peel.newph, label %for.end
for.body.lr.ph.peel.newph: ; preds = %for.body.peel
br label %for.body
for.body: ; preds = %for.body, %for.body.lr.ph.peel.newph
%storemerge5 = phi i32 [ %storemerge.peel, %for.body.lr.ph.peel.newph ], [ %storemerge, %for.body ]
%storemerge.in3 = phi i32 [ %storemerge1, %for.body.lr.ph.peel.newph ], [ %storemerge5, %for.body ]
%idxprom = sext i32 %storemerge5 to i64
%arrayidx = getelementptr inbounds i16, i16* %bit, i64 %idxprom
%2 = load i16, i16* %arrayidx, align 2
%sub1 = add nsw i32 %storemerge.in3, -2
%idxprom2 = sext i32 %sub1 to i64
%arrayidx3 = getelementptr inbounds i16, i16* %bit, i64 %idxprom2
%3 = load i16, i16* %arrayidx3, align 2
tail call void @llvm.CNOT.i16.i16(i16 %2, i16 %3)
%storemerge = add nsw i32 %storemerge5, -1
%cmp = icmp sgt i32 %storemerge5, 1
br i1 %cmp, label %for.body, label %for.end, !llvm.loop !2
for.end: ; preds = %for.body, %for.body.peel, %entry
%.lcssa = phi i16* [ %bit, %entry ], [ %bit, %for.body.peel ], [ %bit, %for.body ]
%4 = load i16, i16* %.lcssa, align 2
tail call void @llvm.H.i16(i16 %4)
ret void
}
; Function Attrs: noinline nounwind uwtable
define i32 @main() local_unnamed_addr #0 {
entry:
%bits = alloca [4 x i16], align 2
%arraydecay = getelementptr inbounds [4 x i16], [4 x i16]* %bits, i64 0, i64 0
call void @catN_IP4_IPx_IPx_IPx_DPx_DPx_DPx_DPx(i16* %arraydecay, i32 undef)
ret i32 0
}
define void @catN_IP4_IPx_IPx_IPx_DPx_DPx_DPx_DPx(i16* %bit, i32 %n) {
entry.:
%0 = load i16, i16* %bit, align 2
tail call void @llvm.H.i16(i16 %0)
%arrayidx1. = getelementptr inbounds i16, i16* %bit, i64 1
%1 = load i16, i16* %arrayidx1., align 2
%2 = load i16, i16* %bit, align 2
tail call void @llvm.CNOT.i16.i16(i16 %1, i16 %2)
%arrayidx1.1. = getelementptr inbounds i16, i16* %bit, i64 2
%3 = load i16, i16* %arrayidx1.1., align 2
%arrayidx3.1. = getelementptr inbounds i16, i16* %bit, i64 1
%4 = load i16, i16* %arrayidx3.1., align 2
tail call void @llvm.CNOT.i16.i16(i16 %3, i16 %4)
%arrayidx1.epil. = getelementptr inbounds i16, i16* %bit, i64 3
%5 = load i16, i16* %arrayidx1.epil., align 2
%arrayidx3.epil. = getelementptr inbounds i16, i16* %bit, i64 2
%6 = load i16, i16* %arrayidx3.epil., align 2
tail call void @llvm.CNOT.i16.i16(i16 %5, i16 %6)
ret void
}
attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 6.0.0 (git at github.com:llvm-mirror/clang.git 0aed123216ad4a38a9c2b16f1783895fd5cb1a04) (git at github.com:llvm-mirror/llvm.git d209b37aec1e392dabbf9b5324ea4a60c36fbc55)"}
!2 = distinct !{!2, !3}
!3 = !{!"llvm.loop.unroll.disable"}
There are still for *loops* in the catN and unCatN functions. A workaround might be to run the GlobalDCE pass on cat_state.n045.ll to remove the functions that are no longer live (dead functions).
Cat_State testcase: https://github.com/ScaffCC/ScaffCC/blob/master/Algorithms/Cat_State/cat_state.n04.scaffold
Scaffold builtin gates: https://github.com/ScaffCC/scaff-clang/blob/master/include/clang/Basic/Builtins.def#L108
Ali JavadiAbhari, Shruti Patil, Daniel Kudrow, Jeff Heckey, Alexey Lvov, Frederic Chong and Margaret Martonosi, ScaffCC: A Framework for Compilation and Analysis of Quantum Computing Programs, ACM International Conference on Computing Frontiers (CF 2014), Cagliari, Italy, May 2014
在 2017年10月24日 12:52, Michael Kruse 写道:
2017-10-24 6:19 GMT+02:00 Leslie Zhai via llvm-dev <llvm-dev at lists.llvm.org <mailto:llvm-dev at lists.llvm.org>>:
> attributes #0 = { noinline nounwind *optnone *uwtable
The optnone attribute (added by clang at -O0) tells LLVM not to apply any transformation. Avoid it with -Xclang -disable-O0-optnone
Michael
--
Regards,
Leslie Zhai - https://reviews.llvm.org/p/xiangzhai/
_______________________________________________
LLVM Developers mailing list
llvm-dev at lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20171027/834f572e/attachment.html>
More information about the llvm-dev
mailing list