[PATCH] D82721: [ScalarEvolution] createSCEV(): MatchBinaryOp(): recognize `urem` disguised as an `srem`
Roman Lebedev via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 29 12:28:58 PDT 2020
lebedev.ri added a comment.
In D82721#2120692 <https://reviews.llvm.org/D82721#2120692>, @efriedma wrote:
> Not sure I understand what benefit you're expecting on the linked testcase.
Since in SCEV it now becomes some expression (as opposed to just `SCEVUnknown`),
it can reason about it more. In particular, SCEVExpander tries to hoist stuff
as much as possible, and now that it is no longer some opaque `SCEVUnknown`,
but an `SCEVURem` with non-zero divisor, it gets hoisted.
Ex 1.: `i & 1`:
*** IR Dump Before rewrite allocas if that makes them promotable ***
; Function Attrs: uwtable
define dso_local void @_Z4loopi(i32 %width) local_unnamed_addr #0 {
entry:
%storage = alloca [2 x i32], align 4
%0 = bitcast [2 x i32]* %storage to i8*
call void @llvm.lifetime.start.p0i8(i64 8, i8* %0) #3
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp ne i32 %i.0, %width
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
call void @llvm.lifetime.end.p0i8(i64 8, i8* %0) #3
ret void
for.body: ; preds = %for.cond
%and = and i32 %i.0, 1
%1 = zext i32 %and to i64
%arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %storage, i64 0, i64 %1
%2 = load i32, i32* %arrayidx, align 4, !tbaa !2
%call = call i32 @_Z3adji(i32 %2)
%3 = load i32, i32* %arrayidx, align 4, !tbaa !2
%add = add nsw i32 %3, %call
store i32 %add, i32* %arrayidx, align 4, !tbaa !2
%inc = add nsw i32 %i.0, 1
br label %for.cond
}
*** IR Dump After rewrite allocas if that makes them promotable ***
; Function Attrs: uwtable
define dso_local void @_Z4loopi(i32 %width) local_unnamed_addr #0 {
entry:
%storage.apc.retyped = alloca <8 x i8>, align 4
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%0 = mul i32 %i.0, -1
%1 = trunc i32 %0 to i1
%2 = zext i1 %1 to i64
%cmp = icmp ne i32 %i.0, %width
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
ret void
for.body: ; preds = %for.cond
%3 = load <8 x i8>, <8 x i8>* %storage.apc.retyped, align 4
%4 = bitcast <8 x i8> %3 to <2 x i32>
%5 = extractelement <2 x i32> %4, i64 %2
%call = call i32 @_Z3adji(i32 %5)
%6 = load <8 x i8>, <8 x i8>* %storage.apc.retyped, align 4
%7 = bitcast <8 x i8> %6 to <2 x i32>
%8 = extractelement <2 x i32> %7, i64 %2
%add = add nsw i32 %8, %call
%9 = load <8 x i8>, <8 x i8>* %storage.apc.retyped, align 4
%10 = bitcast <8 x i8> %9 to <2 x i32>
%11 = insertelement <2 x i32> %10, i32 %add, i64 %2
%12 = bitcast <2 x i32> %11 to <8 x i8>
store <8 x i8> %12, <8 x i8>* %storage.apc.retyped, align 4
%inc = add nsw i32 %i.0, 1
br label %for.cond
}
So it got hoisted. But now,
Ex. 2: `i % 2`:
*** IR Dump Before rewrite allocas if that makes them promotable ***
; Function Attrs: uwtable
define dso_local void @_Z4loopi(i32 %width) local_unnamed_addr #0 {
entry:
%storage = alloca [2 x i32], align 4
%0 = bitcast [2 x i32]* %storage to i8*
call void @llvm.lifetime.start.p0i8(i64 8, i8* %0) #3
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp ne i32 %i.0, %width
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
call void @llvm.lifetime.end.p0i8(i64 8, i8* %0) #3
ret void
for.body: ; preds = %for.cond
%rem = srem i32 %i.0, 2
%idxprom = sext i32 %rem to i64
%arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %storage, i64 0, i64 %idxprom
%1 = load i32, i32* %arrayidx, align 4, !tbaa !2
%call = call i32 @_Z3adji(i32 %1)
%2 = load i32, i32* %arrayidx, align 4, !tbaa !2
%add = add nsw i32 %2, %call
store i32 %add, i32* %arrayidx, align 4, !tbaa !2
%inc = add nsw i32 %i.0, 1
br label %for.cond
}
*** IR Dump After rewrite allocas if that makes them promotable ***
; Function Attrs: uwtable
define dso_local void @_Z4loopi(i32 %width) local_unnamed_addr #0 {
entry:
%storage.apc.retyped = alloca <8 x i8>, align 4
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp ne i32 %i.0, %width
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
ret void
for.body: ; preds = %for.cond
%rem = srem i32 %i.0, 2
%idxprom = sext i32 %rem to i64
%0 = load <8 x i8>, <8 x i8>* %storage.apc.retyped, align 4
%1 = bitcast <8 x i8> %0 to <2 x i32>
%2 = extractelement <2 x i32> %1, i64 %idxprom
%call = call i32 @_Z3adji(i32 %2)
%3 = load <8 x i8>, <8 x i8>* %storage.apc.retyped, align 4
%4 = bitcast <8 x i8> %3 to <2 x i32>
%5 = extractelement <2 x i32> %4, i64 %idxprom
%add = add nsw i32 %5, %call
%6 = load <8 x i8>, <8 x i8>* %storage.apc.retyped, align 4
%7 = bitcast <8 x i8> %6 to <2 x i32>
%8 = insertelement <2 x i32> %7, i32 %add, i64 %idxprom
%9 = bitcast <2 x i32> %8 to <8 x i8>
store <8 x i8> %9, <8 x i8>* %storage.apc.retyped, align 4
%inc = add nsw i32 %i.0, 1
br label %for.cond
}
Nope, not hoisted without this patch.
That's an inconsistency. It leads to other differences, e.g.:
*** IR Dump Before Rotate Loops ***
; Preheader:
entry:
br label %for.cond
; Loop:
for.cond: ; preds = %for.body, %entry
<badref> = phi <2 x i32> [ undef, %entry ], [ %4, %for.body ]
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
<badref> = and i32 %i.0, 1
<badref> = zext i32 %1 to i64
%cmp = icmp eq i32 %i.0, %width
br i1 %cmp, label %for.cond.cleanup, label %for.body
for.body: ; preds = %for.cond
<badref> = extractelement <2 x i32> %0, i64 %2
%call = tail call i32 @_Z3adji(i32 %3)
%add = add nsw i32 %call, %3
<badref> = insertelement <2 x i32> %0, i32 %add, i64 %2
%inc = add nuw nsw i32 %i.0, 1
br label %for.cond
; Exit blocks
for.cond.cleanup: ; preds = %for.cond
ret void
*** IR Dump After Rotate Loops ***
; Preheader:
for.body.lr.ph: ; preds = %entry
br label %for.body
; Loop:
for.body: ; preds = %for.body.lr.ph, %for.body
<badref> = phi i64 [ 0, %for.body.lr.ph ], [ %5, %for.body ]
%i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
<badref> = phi <2 x i32> [ undef, %for.body.lr.ph ], [ %3, %for.body ]
<badref> = extractelement <2 x i32> %1, i64 %0
%call = tail call i32 @_Z3adji(i32 %2)
%add = add nsw i32 %call, %2
<badref> = insertelement <2 x i32> %1, i32 %add, i64 %0
%inc = add nuw nsw i32 %i.07, 1
<badref> = and i32 %inc, 1
<badref> = zext i32 %4 to i64
%cmp = icmp eq i32 %inc, %width
br i1 %cmp, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
; Exit blocks
for.cond.for.cond.cleanup_crit_edge: ; preds = %for.body
br label %for.cond.cleanup
vs.
*** IR Dump Before Rotate Loops ***
; Preheader:
entry:
br label %for.cond
; Loop:
for.cond: ; preds = %for.body, %entry
<badref> = phi <2 x i32> [ undef, %entry ], [ %2, %for.body ]
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%cmp = icmp eq i32 %i.0, %width
br i1 %cmp, label %for.cond.cleanup, label %for.body
for.body: ; preds = %for.cond
%rem = and i32 %i.0, 1
%idxprom = zext i32 %rem to i64
<badref> = extractelement <2 x i32> %0, i64 %idxprom
%call = tail call i32 @_Z3adji(i32 %1)
%add = add nsw i32 %call, %1
<badref> = insertelement <2 x i32> %0, i32 %add, i64 %idxprom
%inc = add nuw nsw i32 %i.0, 1
br label %for.cond
; Exit blocks
for.cond.cleanup: ; preds = %for.cond
ret void
*** IR Dump After Rotate Loops ***
; Preheader:
for.body.lr.ph: ; preds = %entry
br label %for.body
; Loop:
for.body: ; preds = %for.body.lr.ph, %for.body
%i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
<badref> = phi <2 x i32> [ undef, %for.body.lr.ph ], [ %2, %for.body ]
%rem = and i32 %i.07, 1
%idxprom = zext i32 %rem to i64
<badref> = extractelement <2 x i32> %0, i64 %idxprom
%call = tail call i32 @_Z3adji(i32 %1)
%add = add nsw i32 %call, %1
<badref> = insertelement <2 x i32> %0, i32 %add, i64 %idxprom
%inc = add nuw nsw i32 %i.07, 1
%cmp = icmp eq i32 %inc, %width
br i1 %cmp, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
; Exit blocks
for.cond.for.cond.cleanup_crit_edge: ; preds = %for.body
br label %for.cond.cleanup
I'm not going to argue which result is better, but *normally* there wouldn't be such an `srem`,
so the current outcome is an outlier, and i would think we would prefer to not have such odd outliers.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D82721/new/
https://reviews.llvm.org/D82721
More information about the llvm-commits
mailing list