[PATCH] D134930: [LoopInterchange] Do not interchange when a reduction phi in all subloops of the outer loop is not recognizable

Mon Oct 31 19:10:16 PDT 2022

congzhe added a subscriber: uabelho.
congzhe added a comment.

In D134930#3867354 <https://reviews.llvm.org/D134930#3867354>, @Meinersbur wrote:

> LGTM. Thank you.
>
> The additional check make sense.

Thank you Michael @Meinersbur for the review! I've updated the patch slightly to address your comments.

Hi Bardia @bmahjour, I remember you wanted to take a look at the problematic IR that results from interchanging the loop nest in `multilevel-partial-reduction.ll` (the test file in this patch). Here it is: if we interchange the outermost and the middle loops, the IR becomes the one shown below. As @uabelho said in https://reviews.llvm.org/D132055#3814831, before the interchange we read `arrayidx14.promoted.i = load i32, ptr %arrayidx14.i, align 1`, then do 512 rounds of the inner loop adding up the elements read, and then store what we have so far with `store i32 %18, ptr %arrayidx14.i, align 1`. After the interchange, however, we load and then add, but we don't do the store; instead we do the load and execute the inner loop again, so we throw away the calculated sum and just read the same values again. I'd appreciate any further comments you have on it.

  ; @test7 as it looks after the outermost and middle loops have been
  ; interchanged (see the description above). Resulting nest, outermost first:
  ;   - %j.010.i loop: header for.cond4.preheader.i, latch middle.block, exits at 4
  ;   - %i.011.i loop: header for.cond1.preheader.i, latch for.inc19.i, exits at 2
  ;   - vector.body  : %index runs from 0 to 512 in steps of 4
  ; The load that seeds the reduction is executed on every %i iteration, but the
  ; store of the reduced value only happens in middle.block, i.e. once per %j
  ; iteration after the %i loop has finished.
  define i32 @test7() {
  entry:
    br label %for.cond4.preheader.i.preheader

  for.cond1.preheader.i.preheader:                  ; preds = %for.cond4.preheader.i
    br label %for.cond1.preheader.i

  ; Header of the %i loop; the back edge comes from for.inc19.i.
  for.cond1.preheader.i:                            ; preds = %for.cond1.preheader.i.preheader, %for.inc19.i
    %i.011.i = phi i16 [ %inc20.i, %for.inc19.i ], [ 0, %for.cond1.preheader.i.preheader ]
    br label %for.cond4.preheader.i.split

  for.cond4.preheader.i.preheader:                  ; preds = %entry
    br label %for.cond4.preheader.i

  ; Header of the %j loop; the back edge comes from middle.block.
  for.cond4.preheader.i:                            ; preds = %for.cond4.preheader.i.preheader, %middle.block
    %j.010.i = phi i16 [ %inc17.i, %middle.block ], [ 0, %for.cond4.preheader.i.preheader ]
    br label %for.cond1.preheader.i.preheader

  ; Runs once per (%j, %i) pair: loads the current value of c[i][j] and places
  ; it in lane 0 of the initial accumulator vector (lanes 1-3 are zero).
  for.cond4.preheader.i.split:                      ; preds = %for.cond1.preheader.i
    %arrayidx14.i = getelementptr inbounds [2 x [4 x i32]], ptr @c, i16 0, i16 %i.011.i, i16 %j.010.i
    %arrayidx14.promoted.i = load i32, ptr %arrayidx14.i, align 1
    %0 = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 %arrayidx14.promoted.i, i64 0
    br label %vector.body

  ; Innermost loop: each iteration loads b[index+0..3][j] with four scalar
  ; loads, packs them into a <4 x i32> and adds them lane-wise to the
  ; accumulator %vec.phi.
  vector.body:                                      ; preds = %vector.body, %for.cond4.preheader.i.split
    %index = phi i16 [ 0, %for.cond4.preheader.i.split ], [ %index.next, %vector.body ]
    %vec.phi = phi <4 x i32> [ %0, %for.cond4.preheader.i.split ], [ %16, %vector.body ]
    ; %index starts at 0 and advances by 4, so the `or`s below yield
    ; index+1, index+2 and index+3.
    %1 = or i16 %index, 1
    %2 = or i16 %index, 2
    %3 = or i16 %index, 3
    %4 = getelementptr inbounds [512 x [4 x i32]], ptr @b, i16 0, i16 %index, i16 %j.010.i
    %5 = getelementptr inbounds [512 x [4 x i32]], ptr @b, i16 0, i16 %1, i16 %j.010.i
    %6 = getelementptr inbounds [512 x [4 x i32]], ptr @b, i16 0, i16 %2, i16 %j.010.i
    %7 = getelementptr inbounds [512 x [4 x i32]], ptr @b, i16 0, i16 %3, i16 %j.010.i
    %8 = load i32, ptr %4, align 1
    %9 = load i32, ptr %5, align 1
    %10 = load i32, ptr %6, align 1
    %11 = load i32, ptr %7, align 1
    %12 = insertelement <4 x i32> poison, i32 %8, i64 0
    %13 = insertelement <4 x i32> %12, i32 %9, i64 1
    %14 = insertelement <4 x i32> %13, i32 %10, i64 2
    %15 = insertelement <4 x i32> %14, i32 %11, i64 3
    %16 = add <4 x i32> %15, %vec.phi
    %index.next = add nuw i16 %index, 4
    %17 = icmp eq i16 %index.next, 512
    br i1 %17, label %for.inc19.i, label %vector.body

  ; Latch of the %j loop, reached only when the %i loop exits. This is the only
  ; store of the reduction: it writes the horizontally reduced accumulator of
  ; the final %i iteration to the address computed in that same iteration. The
  ; accumulator produced by any earlier %i iteration is never stored.
  middle.block:                                     ; preds = %for.inc19.i
    %.lcssa.lcssa = phi <4 x i32> [ %.lcssa, %for.inc19.i ]
    %arrayidx14.i.lcssa = phi ptr [ %arrayidx14.i, %for.inc19.i ]
    %18 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %.lcssa.lcssa)
    store i32 %18, ptr %arrayidx14.i.lcssa, align 1
    %inc17.i = add nuw nsw i16 %j.010.i, 1
    %exitcond12.not.i = icmp eq i16 %inc17.i, 4
    br i1 %exitcond12.not.i, label %test.exit, label %for.cond4.preheader.i

  ; Latch of the %i loop: branches back to the %i header, which leads to the
  ; load in for.cond4.preheader.i.split again, without storing %.lcssa first.
  for.inc19.i:                                      ; preds = %vector.body
    %.lcssa = phi <4 x i32> [ %16, %vector.body ]
    %inc20.i = add nuw nsw i16 %i.011.i, 1
    %exitcond13.not.i = icmp eq i16 %inc20.i, 2
    br i1 %exitcond13.not.i, label %middle.block, label %for.cond1.preheader.i

  ; Returns the i32 loaded from the start of @c.
  test.exit:                                        ; preds = %middle.block
    %19 = load i32, ptr @c, align 1
    ret i32 %19
  }

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D134930/new/

https://reviews.llvm.org/D134930