[llvm-dev] [RFC] New pass: LoopExitValues

Sat Sep 26 16:28:14 PDT 2015

Hi Steve,

Do you primarily find this to help for nested loops?  If so, that
could be because LSR explicitly bails out of processing them:

   // Skip nested loops until we can model them better with formulae.
   if (!L->empty()) {
     DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
     return;
   }

I don't know how much time you're willing to commit to this, but
perhaps a more principled fix is to change LSR to actually work with
nested loops?

If I comment out this check, after LSR the matrix_mul routine does
not actually look any better (possibly even worse):

; matrix_mul(Size, Dst, Src, Val): two nested counted loops, each running
; %Size times. Every inner iteration loads one i32 through a pointer derived
; from %Src, multiplies it by %Val, and stores the product through the
; matching pointer derived from %Dst. Nothing is executed when %Size is 0.
define void @matrix_mul(i32 %Size, i32* nocapture %Dst, i32* nocapture readonly %Src, i32 %Val) {
entry:
   ; i8* views of both buffers so the outer loop can index them by byte offset.
   %Src12 = bitcast i32* %Src to i8*
   %Dst14 = bitcast i32* %Dst to i8*
   ; Guard: a zero %Size skips both loops entirely.
   %cmp.25 = icmp eq i32 %Size, 0
   br i1 %cmp.25, label %for.cond.cleanup, label %for.body.4.lr.ph.preheader

for.body.4.lr.ph.preheader:                       ; preds = %entry
   ; %0 = %Size * 4: the byte offset added per outer iteration
   ; (%Size elements of 4-byte i32).
   %0 = shl i32 %Size, 2
   br label %for.body.4.lr.ph

; Outer loop header. Carries two induction variables:
;   %lsr.iv17 - down-counter from %Size, decremented in for.cond.cleanup.3
;   %lsr.iv10 - running byte offset, starts at 0 and grows by %0 per iteration
for.body.4.lr.ph:                                 ; preds = %for.body.4.lr.ph.preheader, %for.cond.cleanup.3
   %lsr.iv17 = phi i32 [ %Size, %for.body.4.lr.ph.preheader ], [ %lsr.iv.next18, %for.cond.cleanup.3 ]
   %lsr.iv10 = phi i32 [ 0, %for.body.4.lr.ph.preheader ], [ %lsr.iv.next11, %for.cond.cleanup.3 ]
   ; Recompute the starting Src/Dst pointers for this outer iteration from the
   ; byte offset (i8 GEP followed by a cast back to i32*).
   %uglygep = getelementptr i8, i8* %Src12, i32 %lsr.iv10
   %uglygep13 = bitcast i8* %uglygep to i32*
   %uglygep15 = getelementptr i8, i8* %Dst14, i32 %lsr.iv10
   %uglygep1516 = bitcast i8* %uglygep15 to i32*
   br label %for.body.4

; Inner loop. Three induction variables: the Src pointer (%lsr.iv8), the Dst
; pointer (%lsr.iv3), and a down-counter from %Size (%lsr.iv).
for.body.4:                                       ; preds = %for.body.4, %for.body.4.lr.ph
   %lsr.iv8 = phi i32* [ %scevgep9, %for.body.4 ], [ %uglygep13, %for.body.4.lr.ph ]
   %lsr.iv3 = phi i32* [ %scevgep4, %for.body.4 ], [ %uglygep1516, %for.body.4.lr.ph ]
   %lsr.iv = phi i32 [ %lsr.iv.next, %for.body.4 ], [ %Size, %for.body.4.lr.ph ]
   ; Loop body proper: *dst = *src * %Val.
   %1 = load i32, i32* %lsr.iv8, align 4, !tbaa !0
   %mul5 = mul i32 %1, %Val
   store i32 %mul5, i32* %lsr.iv3, align 4, !tbaa !0
   ; Step the counter down and advance both pointers by one i32 element.
   %lsr.iv.next = add i32 %lsr.iv, -1
   %scevgep4 = getelementptr i32, i32* %lsr.iv3, i32 1
   %scevgep9 = getelementptr i32, i32* %lsr.iv8, i32 1
   ; Leave the inner loop once the counter reaches zero.
   %exitcond = icmp eq i32 %lsr.iv.next, 0
   br i1 %exitcond, label %for.cond.cleanup.3, label %for.body.4

; Outer loop latch: advance the byte offset by %0, decrement the outer
; counter, and exit when it reaches zero.
for.cond.cleanup.3:                               ; preds = %for.body.4
   %lsr.iv.next11 = add i32 %lsr.iv10, %0
   %lsr.iv.next18 = add i32 %lsr.iv17, -1
   %exitcond27 = icmp eq i32 %lsr.iv.next18, 0
   br i1 %exitcond27, label %for.cond.cleanup.loopexit, label %for.body.4.lr.ph

; Dedicated exit block for the outer loop; it only forwards to the return.
for.cond.cleanup.loopexit:                        ; preds = %for.cond.cleanup.3
   br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
   ret void
}

-- Sanjoy