[LLVMdev] InstCombine adds bit masks, confuses self, others
Jakob Stoklund Olesen
stoklund at 2pi.dk
Tue Apr 17 10:40:16 PDT 2012
On Apr 17, 2012, at 6:09 AM, Chandler Carruth <chandlerc at google.com> wrote:
> I really dislike hasOneUse-based "solutions" at the IR / InstCombine layer. They result in strange artifacts during optimization: places where adding similar code turns off optimizations because we fold the similar bits together and reuse parts of the computation.
In my example, it won't even work. The second use of 'x+x' only emerges after GVN, after the harm has been done.
However, I am more concerned about hiding things from scalar evolution than from InstCombine itself. Here is an example I was expecting to fail:
$ cat small2.c
struct desc {
unsigned size : 2;
unsigned skip : 6;
unsigned value;
};
void f(struct desc d, unsigned *p) {
for (unsigned i = 0; i != 64; ++i) {
*p = 0; p += d.skip;
*p = 1; p += d.skip;
}
}
It does the right thing, though:
define void @f(i64 %d.coerce, i32* nocapture %p) nounwind uwtable ssp {
entry:
%0 = lshr i64 %d.coerce, 2
%bf.clear = and i64 %0, 63
%add.ptr.sum = shl nuw nsw i64 %bf.clear, 1
br label %for.body
for.body: ; preds = %entry, %for.body
%p.addr.06 = phi i32* [ %p, %entry ], [ %add.ptr3, %for.body ]
%i.05 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
store i32 0, i32* %p.addr.06, align 4, !tbaa !0
%add.ptr = getelementptr inbounds i32* %p.addr.06, i64 %bf.clear
store i32 1, i32* %add.ptr, align 4, !tbaa !0
%add.ptr3 = getelementptr inbounds i32* %p.addr.06, i64 %add.ptr.sum
%inc = add i32 %i.05, 1
%cmp = icmp eq i32 %inc, 64
br i1 %cmp, label %for.end, label %for.body
for.end: ; preds = %for.body
ret void
}
Notice how the two gep offsets %bf.clear and %add.ptr.sum have a simple arithmetic relationship that scalar evolution can figure out.
With a very small change, that breaks:
$ cat small3.c
void f(unsigned long d, unsigned *p) {
unsigned long skip = d/4;
for (unsigned i = 0; i != 64; ++i) {
*p = 0; p += skip;
*p = 1; p += skip;
}
}
define void @f(i64 %d, i32* nocapture %p) nounwind uwtable ssp {
entry:
%div = lshr i64 %d, 2
%0 = lshr i64 %d, 1
%add.ptr.sum = and i64 %0, 9223372036854775806
br label %for.body
for.body: ; preds = %entry, %for.body
%i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%p.addr.02 = phi i32* [ %p, %entry ], [ %add.ptr1, %for.body ]
store i32 0, i32* %p.addr.02, align 4, !tbaa !0
%add.ptr = getelementptr inbounds i32* %p.addr.02, i64 %div
store i32 1, i32* %add.ptr, align 4, !tbaa !0
%add.ptr1 = getelementptr inbounds i32* %p.addr.02, i64 %add.ptr.sum
%inc = add i32 %i.03, 1
%cmp = icmp eq i32 %inc, 64
br i1 %cmp, label %for.end, label %for.body
for.end: ; preds = %for.body
ret void
}
Now it suddenly becomes very difficult for scalar evolution to figure out the relationship between %div and %add.ptr.sum.
I think the canonical form should look more like the bit field example - don't hide the shl behind a bit mask.
I'll try deferring the transforms that hide an shl instruction to DAGCombine. I'll run some benchmarks.
/jakob
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20120417/be34ad55/attachment.html>
More information about the llvm-dev
mailing list