[LLVMdev] How can I remove these redundant copy between registers?
zan jyu Wong
zyfwong at gmail.com
Thu May 21 19:26:26 PDT 2015
Hi Sam, thanks for your help.
I had never noticed opt before. I tried running it on the bitcode, but
I still get the same code as before (the full output is included below).
FYI, here is what I did:
$ clang -c -m32 -O3 -emit-llvm ex11.c -o ex11.bc
$ opt -S -gvn ex11.bc > ex11.ll
$ llc -march=bfin ex11.ll
Is there anything I'm missing?
And this is what I did before:
$ clang -S -m32 -emit-llvm -O3 file.c -o file.ll
$ llc -march=bfin file.ll
Original C Source File:
1 typedef struct state {
2 int V[8][8];
3 int *offset[8];
4 } state_t;
5
6 void foo(state_t* state, int ch, int *buffer)
7 {
8 int *offset = state->offset[ch];
9
10 int idx, i;
11 for (i = 0, idx = 0; i < 100; i++, idx += 5) {
12 //long long tmp = 0;
13 int tmp = 0;
14 for (int j = 0; j < 2; j++) {
15 tmp += state->V[ch][offset[i]+2*j+0]*buffer[idx + j];
16 tmp += state->V[ch][offset[i]+2*j+1]*buffer[idx + j];
17 }
18
19 // disable optimization
20 //volatile long long ret = tmp;
21 volatile int ret = tmp;
22 }
23 }
The .ll file after running opt on the .bc file:
; ModuleID = 'ex11.bc'
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
target triple = "i386-apple-macosx10.10.0"
%struct.state = type { [8 x [8 x i32]], [8 x i32*] }
; Function Attrs: nounwind ssp
define void @foo(%struct.state* nocapture readonly %state, i32 %ch, i32*
nocapture readonly %buffer) #0 {
entry:
%ret = alloca i32, align 4
%arrayidx = getelementptr inbounds %struct.state* %state, i32 0, i32 1,
i32 %ch
%0 = load i32** %arrayidx, align 4, !tbaa !2
br label %for.cond3.preheader
for.cond3.preheader: ; preds =
%for.cond3.preheader, %entry
%i.052 = phi i32 [ 0, %entry ], [ %inc27, %for.cond3.preheader ]
%idx.051 = phi i32 [ 0, %entry ], [ %add28, %for.cond3.preheader ]
%arrayidx6 = getelementptr inbounds i32* %0, i32 %i.052
%1 = load i32* %arrayidx6, align 4, !tbaa !6
%arrayidx9 = getelementptr inbounds %struct.state* %state, i32 0, i32 0,
i32 %ch, i32 %1
%2 = load i32* %arrayidx9, align 4, !tbaa !6
%arrayidx11 = getelementptr inbounds i32* %buffer, i32 %idx.051
%3 = load i32* %arrayidx11, align 4, !tbaa !6
%add17 = add nsw i32 %1, 1
%arrayidx20 = getelementptr inbounds %struct.state* %state, i32 0, i32 0,
i32 %ch, i32 %add17
%4 = load i32* %arrayidx20, align 4, !tbaa !6
%tmp = add i32 %4, %2
%tmp48 = mul i32 %tmp, %3
%add.1 = add nsw i32 %1, 2
%arrayidx9.1 = getelementptr inbounds %struct.state* %state, i32 0, i32
0, i32 %ch, i32 %add.1
%5 = load i32* %arrayidx9.1, align 4, !tbaa !6
%add10.1 = add nuw nsw i32 %idx.051, 1
%arrayidx11.1 = getelementptr inbounds i32* %buffer, i32 %add10.1
%6 = load i32* %arrayidx11.1, align 4, !tbaa !6
%add17.1 = add nsw i32 %1, 3
%arrayidx20.1 = getelementptr inbounds %struct.state* %state, i32 0, i32
0, i32 %ch, i32 %add17.1
%7 = load i32* %arrayidx20.1, align 4, !tbaa !6
%tmp.1 = add i32 %7, %5
%tmp48.1 = mul i32 %tmp.1, %6
%add24.1 = add i32 %tmp48.1, %tmp48
store volatile i32 %add24.1, i32* %ret, align 4
%inc27 = add nuw nsw i32 %i.052, 1
%add28 = add nuw nsw i32 %idx.051, 5
%exitcond53 = icmp eq i32 %inc27, 100
br i1 %exitcond53, label %for.end29, label %for.cond3.preheader
for.end29: ; preds =
%for.cond3.preheader
ret void
}
attributes #0 = { nounwind ssp "less-precise-fpmad"="false"
"no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"
"no-infs-fp-math"="false" "no-nans-fp-math"="false"
"stack-protector-buffer-size"="8" "unsafe-fp-math"="false"
"use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"PIC Level", i32 2}
!1 = !{!"clang version 3.6.0 (tags/RELEASE_360/final)"}
!2 = !{!3, !3, i64 0}
!3 = !{!"any pointer", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = !{!7, !7, i64 0}
!7 = !{!"int", !4, i64 0}
And the generated .s file:
.text
.macosx_version_min 10, 10
.file "ex11.ll"
.globl foo
.align 4
.type foo,@function
foo: // @foo
// BB#0: // %entry
link 16;
[fp - 4] = r4;
[fp - 8] = r5;
[fp - 12] = r6;
r3 = r1 << 2;
r4 = r0 + r3;
r3 = 0 (x);
r2 += 4;
p0 = r4;
r4 = [p0 + 256];
p0 = r2;
LBB0_1: // %for.cond3.preheader
// =>This Inner Loop Header: Depth=1
r2 = r1 << 5;
r2 = r0 + r2;
r5 = r4 + r3;
p1 = r5;
r5 = [p1];
r5 = r5 << 2;
r2 = r2 + r5;
p1 = r2; <--------------
r5 = [p1];
p1 = r2; <--------------- redundant copy
r6 = [p1 + 4];
r5 = r6 + r5;
r6 = [p0 + -4];
r5 *= r6;
p1 = r2; <--------------- redundant copy
r6 = [p1 + 8];
p1 = r2; <--------------- redundant copy
r2 = [p1 + 12];
r2 = r2 + r6;
r6 = [p0];
r2 *= r6;
r2 = r2 + r5;
[fp - 16] = r2;
r2 = p0;
r2 += 20;
r3 += 4;
r5 = 400 (z);
cc = r3 == r5;
p0 = r2;
if !cc jump LBB0_1;
jump LBB0_2;
LBB0_2: // %for.end29
r6 = [fp - 12];
r5 = [fp - 8];
r4 = [fp - 4];
unlink;
rts;
Ltmp0:
.size foo, Ltmp0-foo
Huang
On Fri, May 22, 2015 at 12:24 AM, Samuel Crow <samueldcrow at gmail.com> wrote:
>
> On May 21, 2015, at 7:21 AM, zan jyu Wong wrote:
>
> > Hi,
> >
> > I've been working on a Blackfin backend (llvm-3.6.0) based on the
> previous one that was removed in llvm-3.1.
> > llc generates code like this:
> >
> > 29 p1 = r2;
> > 30 r5 = [p1];
> > 31 p1 = r2;
> > 32 r6 = [p1 + 4];
> > 33 r5 = r6 + r5;
> > 34 r6 = [p0 + -4];
> > 35 r5 *= r6;
> > 36 p1 = r2;
> > 37 r6 = [p1 + 8];
> > 38 p1 = r2;
> >
> > p1 and r2 are in different register classes.
> > A p* register can be used to load/store values from/to memory, while an r*
> register cannot.
> >
> > As we can see, lines 31, 36, and 38 can be deleted. How can I configure llc
> to do this? Or do I have to write a custom pass to do this optimization?
> Any suggestions are welcome.
> >
> > Thanks,
> >
> > Huang
>
> Hello Huang,
>
> Silly as this may sound, did you run opt on the bitcode before using
> llc?
>
> Cheers,
>
> Sam
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20150522/71f36cf0/attachment.html>
More information about the llvm-dev
mailing list