[PATCH] D82721: [ScalarEvolution] createSCEV(): MatchBinaryOp(): recognize `urem` disguised as an `srem`

Mon Jun 29 12:28:58 PDT 2020

lebedev.ri added a comment.

In D82721#2120692 <https://reviews.llvm.org/D82721#2120692>, @efriedma wrote:

> Not sure I understand what benefit you're expecting on the linked testcase.

Since in SCEV it now becomes some expression (as opposed to just `SCEVUnknown`),
it can reason about it more. In particular, SCEVExpander tries to hoist stuff
as much as possible, and now that it is no longer some opaque `SCEVUnknown`,
but an `SCEVURem` with non-zero divisor, it gets hoisted.

Ex 1.: `i & 1`:

  *** IR Dump Before rewrite allocas if that makes them promotable ***
  ; Function Attrs: uwtable
  define dso_local void @_Z4loopi(i32 %width) local_unnamed_addr #0 {
  entry:
    %storage = alloca [2 x i32], align 4
    %0 = bitcast [2 x i32]* %storage to i8*
    call void @llvm.lifetime.start.p0i8(i64 8, i8* %0) #3
    br label %for.cond

  for.cond:                                         ; preds = %for.body, %entry
    %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
    %cmp = icmp ne i32 %i.0, %width
    br i1 %cmp, label %for.body, label %for.cond.cleanup

  for.cond.cleanup:                                 ; preds = %for.cond
    call void @llvm.lifetime.end.p0i8(i64 8, i8* %0) #3
    ret void

  for.body:                                         ; preds = %for.cond
    %and = and i32 %i.0, 1
    %1 = zext i32 %and to i64
    %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %storage, i64 0, i64 %1
    %2 = load i32, i32* %arrayidx, align 4, !tbaa !2
    %call = call i32 @_Z3adji(i32 %2)
    %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
    %add = add nsw i32 %3, %call
    store i32 %add, i32* %arrayidx, align 4, !tbaa !2
    %inc = add nsw i32 %i.0, 1
    br label %for.cond
  }
  *** IR Dump After rewrite allocas if that makes them promotable ***
  ; Function Attrs: uwtable
  define dso_local void @_Z4loopi(i32 %width) local_unnamed_addr #0 {
  entry:
    %storage.apc.retyped = alloca <8 x i8>, align 4
    br label %for.cond

  for.cond:                                         ; preds = %for.body, %entry
    %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
    %0 = mul i32 %i.0, -1
    %1 = trunc i32 %0 to i1
    %2 = zext i1 %1 to i64
    %cmp = icmp ne i32 %i.0, %width
    br i1 %cmp, label %for.body, label %for.cond.cleanup

  for.cond.cleanup:                                 ; preds = %for.cond
    ret void

  for.body:                                         ; preds = %for.cond
    %3 = load <8 x i8>, <8 x i8>* %storage.apc.retyped, align 4
    %4 = bitcast <8 x i8> %3 to <2 x i32>
    %5 = extractelement <2 x i32> %4, i64 %2
    %call = call i32 @_Z3adji(i32 %5)
    %6 = load <8 x i8>, <8 x i8>* %storage.apc.retyped, align 4
    %7 = bitcast <8 x i8> %6 to <2 x i32>
    %8 = extractelement <2 x i32> %7, i64 %2
    %add = add nsw i32 %8, %call
    %9 = load <8 x i8>, <8 x i8>* %storage.apc.retyped, align 4
    %10 = bitcast <8 x i8> %9 to <2 x i32>
    %11 = insertelement <2 x i32> %10, i32 %add, i64 %2
    %12 = bitcast <2 x i32> %11 to <8 x i8>
    store <8 x i8> %12, <8 x i8>* %storage.apc.retyped, align 4
    %inc = add nsw i32 %i.0, 1
    br label %for.cond
  }

So it got hoisted. But now,

Ex. 2: `i % 2`:

  *** IR Dump Before rewrite allocas if that makes them promotable ***
  ; Function Attrs: uwtable
  define dso_local void @_Z4loopi(i32 %width) local_unnamed_addr #0 {
  entry:
    %storage = alloca [2 x i32], align 4
    %0 = bitcast [2 x i32]* %storage to i8*
    call void @llvm.lifetime.start.p0i8(i64 8, i8* %0) #3
    br label %for.cond

  for.cond:                                         ; preds = %for.body, %entry
    %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
    %cmp = icmp ne i32 %i.0, %width
    br i1 %cmp, label %for.body, label %for.cond.cleanup

  for.cond.cleanup:                                 ; preds = %for.cond
    call void @llvm.lifetime.end.p0i8(i64 8, i8* %0) #3
    ret void

  for.body:                                         ; preds = %for.cond
    %rem = srem i32 %i.0, 2
    %idxprom = sext i32 %rem to i64
    %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %storage, i64 0, i64 %idxprom
    %1 = load i32, i32* %arrayidx, align 4, !tbaa !2
    %call = call i32 @_Z3adji(i32 %1)
    %2 = load i32, i32* %arrayidx, align 4, !tbaa !2
    %add = add nsw i32 %2, %call
    store i32 %add, i32* %arrayidx, align 4, !tbaa !2
    %inc = add nsw i32 %i.0, 1
    br label %for.cond
  }
  *** IR Dump After rewrite allocas if that makes them promotable ***
  ; Function Attrs: uwtable
  define dso_local void @_Z4loopi(i32 %width) local_unnamed_addr #0 {
  entry:
    %storage.apc.retyped = alloca <8 x i8>, align 4
    br label %for.cond

  for.cond:                                         ; preds = %for.body, %entry
    %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
    %cmp = icmp ne i32 %i.0, %width
    br i1 %cmp, label %for.body, label %for.cond.cleanup

  for.cond.cleanup:                                 ; preds = %for.cond
    ret void

  for.body:                                         ; preds = %for.cond
    %rem = srem i32 %i.0, 2
    %idxprom = sext i32 %rem to i64
    %0 = load <8 x i8>, <8 x i8>* %storage.apc.retyped, align 4
    %1 = bitcast <8 x i8> %0 to <2 x i32>
    %2 = extractelement <2 x i32> %1, i64 %idxprom
    %call = call i32 @_Z3adji(i32 %2)
    %3 = load <8 x i8>, <8 x i8>* %storage.apc.retyped, align 4
    %4 = bitcast <8 x i8> %3 to <2 x i32>
    %5 = extractelement <2 x i32> %4, i64 %idxprom
    %add = add nsw i32 %5, %call
    %6 = load <8 x i8>, <8 x i8>* %storage.apc.retyped, align 4
    %7 = bitcast <8 x i8> %6 to <2 x i32>
    %8 = insertelement <2 x i32> %7, i32 %add, i64 %idxprom
    %9 = bitcast <2 x i32> %8 to <8 x i8>
    store <8 x i8> %9, <8 x i8>* %storage.apc.retyped, align 4
    %inc = add nsw i32 %i.0, 1
    br label %for.cond
  }

Nope, not hoisted without this patch.

That's an inconsistency. It leads to other differences, e.g.:

  *** IR Dump Before Rotate Loops ***
  ; Preheader:
  entry:
    br label %for.cond

  ; Loop:
  for.cond:                                         ; preds = %for.body, %entry
    <badref> = phi <2 x i32> [ undef, %entry ], [ %4, %for.body ]
    %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
    <badref> = and i32 %i.0, 1
    <badref> = zext i32 %1 to i64
    %cmp = icmp eq i32 %i.0, %width
    br i1 %cmp, label %for.cond.cleanup, label %for.body

  for.body:                                         ; preds = %for.cond
    <badref> = extractelement <2 x i32> %0, i64 %2
    %call = tail call i32 @_Z3adji(i32 %3)
    %add = add nsw i32 %call, %3
    <badref> = insertelement <2 x i32> %0, i32 %add, i64 %2
    %inc = add nuw nsw i32 %i.0, 1
    br label %for.cond

  ; Exit blocks
  for.cond.cleanup:                                 ; preds = %for.cond
    ret void
  *** IR Dump After Rotate Loops ***
  ; Preheader:
  for.body.lr.ph:                                   ; preds = %entry
    br label %for.body

  ; Loop:
  for.body:                                         ; preds = %for.body.lr.ph, %for.body
    <badref> = phi i64 [ 0, %for.body.lr.ph ], [ %5, %for.body ]
    %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
    <badref> = phi <2 x i32> [ undef, %for.body.lr.ph ], [ %3, %for.body ]
    <badref> = extractelement <2 x i32> %1, i64 %0
    %call = tail call i32 @_Z3adji(i32 %2)
    %add = add nsw i32 %call, %2
    <badref> = insertelement <2 x i32> %1, i32 %add, i64 %0
    %inc = add nuw nsw i32 %i.07, 1
    <badref> = and i32 %inc, 1
    <badref> = zext i32 %4 to i64
    %cmp = icmp eq i32 %inc, %width
    br i1 %cmp, label %for.cond.for.cond.cleanup_crit_edge, label %for.body

  ; Exit blocks
  for.cond.for.cond.cleanup_crit_edge:              ; preds = %for.body
    br label %for.cond.cleanup

vs.

  *** IR Dump Before Rotate Loops ***
  ; Preheader:
  entry:
    br label %for.cond

  ; Loop:
  for.cond:                                         ; preds = %for.body, %entry
    <badref> = phi <2 x i32> [ undef, %entry ], [ %2, %for.body ]
    %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
    %cmp = icmp eq i32 %i.0, %width
    br i1 %cmp, label %for.cond.cleanup, label %for.body

  for.body:                                         ; preds = %for.cond
    %rem = and i32 %i.0, 1
    %idxprom = zext i32 %rem to i64
    <badref> = extractelement <2 x i32> %0, i64 %idxprom
    %call = tail call i32 @_Z3adji(i32 %1)
    %add = add nsw i32 %call, %1
    <badref> = insertelement <2 x i32> %0, i32 %add, i64 %idxprom
    %inc = add nuw nsw i32 %i.0, 1
    br label %for.cond

  ; Exit blocks
  for.cond.cleanup:                                 ; preds = %for.cond
    ret void
  *** IR Dump After Rotate Loops ***
  ; Preheader:
  for.body.lr.ph:                                   ; preds = %entry
    br label %for.body

  ; Loop:
  for.body:                                         ; preds = %for.body.lr.ph, %for.body
    %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
    <badref> = phi <2 x i32> [ undef, %for.body.lr.ph ], [ %2, %for.body ]
    %rem = and i32 %i.07, 1
    %idxprom = zext i32 %rem to i64
    <badref> = extractelement <2 x i32> %0, i64 %idxprom
    %call = tail call i32 @_Z3adji(i32 %1)
    %add = add nsw i32 %call, %1
    <badref> = insertelement <2 x i32> %0, i32 %add, i64 %idxprom
    %inc = add nuw nsw i32 %i.07, 1
    %cmp = icmp eq i32 %inc, %width
    br i1 %cmp, label %for.cond.for.cond.cleanup_crit_edge, label %for.body

  ; Exit blocks
  for.cond.for.cond.cleanup_crit_edge:              ; preds = %for.body
    br label %for.cond.cleanup

I'm not going to argue which result is better, but *normally* there wouldn't be such an `srem`,
so the current outcome is an outlier, and i would think we would prefer to not have such odd outliers.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82721/new/

https://reviews.llvm.org/D82721