[llvm] 6a076fa - [LoopFlatten] Make the analysis more robust after IV widening

Fri Sep 10 04:37:20 PDT 2021

Author: Sjoerd Meijer
Date: 2021-09-10T12:34:04+01:00
New Revision: 6a076fa9539ee6eea078368bc5298e381c1c059e

URL: https://github.com/llvm/llvm-project/commit/6a076fa9539ee6eea078368bc5298e381c1c059e
DIFF: https://github.com/llvm/llvm-project/commit/6a076fa9539ee6eea078368bc5298e381c1c059e.diff

LOG: [LoopFlatten] Make the analysis more robust after IV widening

LoopFlatten wasn't triggering on this motivating case after IV widening:

  void foo(int *A, int N, int M) {
    for (int i = 0; i < N; ++i)
      for (int j = 0; j < M; ++j)
        f(A[i*M+j]);
  }

The reason was that the old induction phi nodes were getting in the way. These
narrow and dead induction phis are not always trivially dead, and having both
the narrow and wide IVs confused the analysis and caused it to bail. This adds
some extra bookkeeping for these old phis, so we can filter them out when
checks on phi nodes are performed. Other clean up passes will get rid of these
old phis and increment instructions.

As this was one of the motivating examples from the beginning, it was
surprising this wasn't triggering from C/C++ code. It looks like the IR and CFG
is just slightly different.

Differential Revision: https://reviews.llvm.org/D109309

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/LoopFlatten.cpp
    llvm/test/Transforms/LoopFlatten/widen-iv.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index 30da7c4d87b7e..83d7899d75bc9 100644

--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -94,6 +94,11 @@ struct FlattenInfo {
   // Whether this holds the flatten info before or after widening.
   bool Widened = false;
 
+  // Holds the old/narrow induction phis, i.e. the Phis before IV widening has
+  // been applied. This bookkeeping is used so we can skip some checks on these
+  // phi nodes.
+  SmallPtrSet<PHINode *, 2> OldInductionPHIs;
+
   FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL) {};
 };
 
@@ -263,6 +268,8 @@ static bool checkPHIs(FlattenInfo &FI, const TargetTransformInfo *TTI) {
     // them specially when doing the transformation.
     if (&InnerPHI == FI.InnerInductionPHI)
       continue;
+    if (FI.Widened && FI.OldInductionPHIs.count(&InnerPHI))
+      continue;
 
     // Each inner loop PHI node must have two incoming values/blocks - one
     // from the pre-header, and one from the latch.
@@ -308,6 +315,8 @@ static bool checkPHIs(FlattenInfo &FI, const TargetTransformInfo *TTI) {
   }
 
   for (PHINode &OuterPHI : FI.OuterLoop->getHeader()->phis()) {
+    if (FI.Widened && FI.OldInductionPHIs.count(&OuterPHI))
+      continue;
     if (!SafeOuterPHIs.count(&OuterPHI)) {
       LLVM_DEBUG(dbgs() << "found unsafe PHI in outer loop: "; OuterPHI.dump());
       return false;
@@ -398,8 +407,8 @@ static bool checkIVUsers(FlattenInfo &FI) {
     if (U == FI.InnerIncrement)
       continue;
 
-    // After widening the IVs, a trunc instruction might have been introduced, so
-    // look through truncs.
+    // After widening the IVs, a trunc instruction might have been introduced,
+    // so look through truncs.
     if (isa<TruncInst>(U)) {
       if (!U->hasOneUse())
         return false;
@@ -424,11 +433,23 @@ static bool checkIVUsers(FlattenInfo &FI) {
 
     // Matches the same pattern as above, except it also looks for truncs
     // on the phi, which can be the result of widening the induction variables.
-    bool IsAddTrunc = match(U, m_c_Add(m_Trunc(m_Specific(FI.InnerInductionPHI)),
-                                       m_Value(MatchedMul))) &&
-                      match(MatchedMul,
-                            m_c_Mul(m_Trunc(m_Specific(FI.OuterInductionPHI)),
-                            m_Value(MatchedItCount)));
+    bool IsAddTrunc =
+        match(U, m_c_Add(m_Trunc(m_Specific(FI.InnerInductionPHI)),
+                         m_Value(MatchedMul))) &&
+        match(MatchedMul, m_c_Mul(m_Trunc(m_Specific(FI.OuterInductionPHI)),
+                                  m_Value(MatchedItCount)));
+
+    if (!MatchedItCount)
+      return false;
+    // Look through extends if the IV has been widened.
+    if (FI.Widened &&
+        (isa<SExtInst>(MatchedItCount) || isa<ZExtInst>(MatchedItCount))) {
+      assert(MatchedItCount->getType() == FI.InnerInductionPHI->getType() &&
+             "Unexpected type mismatch in types after widening");
+      MatchedItCount = isa<SExtInst>(MatchedItCount)
+                           ? dyn_cast<SExtInst>(MatchedItCount)->getOperand(0)
+                           : dyn_cast<ZExtInst>(MatchedItCount)->getOperand(0);
+    }
 
     if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerTripCount) {
       LLVM_DEBUG(dbgs() << "Use is optimisable\n");
@@ -668,14 +689,11 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
   }
 
   SCEVExpander Rewriter(*SE, DL, "loopflatten");
-  SmallVector<WideIVInfo, 2> WideIVs;
   SmallVector<WeakTrackingVH, 4> DeadInsts;
-  WideIVs.push_back( {FI.InnerInductionPHI, MaxLegalType, false });
-  WideIVs.push_back( {FI.OuterInductionPHI, MaxLegalType, false });
   unsigned ElimExt = 0;
   unsigned Widened = 0;
 
-  for (const auto &WideIV : WideIVs) {
+  auto CreateWideIV = [&] (WideIVInfo WideIV, bool &Deleted) -> bool {
     PHINode *WidePhi = createWideIV(WideIV, LI, SE, Rewriter, DT, DeadInsts,
                                     ElimExt, Widened, true /* HasGuards */,
                                     true /* UsePostIncrementRanges */);
@@ -683,11 +701,28 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
       return false;
     LLVM_DEBUG(dbgs() << "Created wide phi: "; WidePhi->dump());
     LLVM_DEBUG(dbgs() << "Deleting old phi: "; WideIV.NarrowIV->dump());
-    RecursivelyDeleteDeadPHINode(WideIV.NarrowIV);
-  }
-  // After widening, rediscover all the loop components.
+    Deleted = RecursivelyDeleteDeadPHINode(WideIV.NarrowIV);
+    return true;
+  };
+
+  bool Deleted;
+  if (!CreateWideIV({FI.InnerInductionPHI, MaxLegalType, false }, Deleted))
+    return false;
+  // If the inner Phi node cannot be trivially deleted, we need to at least
+  // bring it in a consistent state.
+  if (!Deleted)
+    FI.InnerInductionPHI->removeIncomingValue(FI.InnerLoop->getLoopLatch());
+  if (!CreateWideIV({FI.OuterInductionPHI, MaxLegalType, false }, Deleted))
+    return false;
+
   assert(Widened && "Widened IV expected");
   FI.Widened = true;
+
+  // Save the old/narrow induction phis, which we need to ignore in CheckPHIs.
+  FI.OldInductionPHIs.insert(FI.InnerInductionPHI);
+  FI.OldInductionPHIs.insert(FI.OuterInductionPHI);
+
+  // After widening, rediscover all the loop components.
   return CanFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
 }
 

diff  --git a/llvm/test/Transforms/LoopFlatten/widen-iv.ll b/llvm/test/Transforms/LoopFlatten/widen-iv.ll
index abd70138e4c1c..5d0296c834bfa 100644
--- a/llvm/test/Transforms/LoopFlatten/widen-iv.ll
+++ b/llvm/test/Transforms/LoopFlatten/widen-iv.ll
@@ -1,6 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -loop-simplify -loop-flatten -loop-flatten-widen-iv=true -verify-loop-info -verify-dom-info -verify-scev -verify | FileCheck %s --check-prefix=CHECK
-; RUN: opt < %s -S -loop-simplify -loop-flatten -loop-flatten-widen-iv=false -verify-loop-info -verify-dom-info -verify-scev -verify | FileCheck %s --check-prefix=DONTWIDEN
+
+; RUN: opt < %s -S -loop-simplify -loop-flatten -loop-flatten-widen-iv=true \
+; RUN:     -verify-loop-info -verify-dom-info -verify-scev -verify \
+; RUN:     -loop-flatten-cost-threshold=6 | \
+; RUN:     FileCheck %s --check-prefix=CHECK
+
+; RUN: opt < %s -S -loop-simplify -loop-flatten -loop-flatten-widen-iv=false \
+; RUN:     -verify-loop-info -verify-dom-info -verify-scev -verify | \
+; RUN:     FileCheck %s --check-prefix=DONTWIDEN
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
@@ -81,6 +88,228 @@ for.cond.cleanup:
   ret void
 }
 
+; This test case corresponds to this input:
+;
+;     for (int i = 0; i < N; ++i)
+;       for (int j = 0; j < M; ++j)
+;         f(A[i*M+j]);
+;
+; It is very similar to test case @foo above, but the CFG is slightly
+; 
diff erent, making the analysis slightly 
diff erent.
+;
+define void @foo2_sext(i32* nocapture readonly %A, i32 %N, i32 %M) {
+; CHECK-LABEL: @foo2_sext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP17]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader.lr.ph:
+; CHECK-NEXT:    [[CMP215:%.*]] = icmp sgt i32 [[M:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP215]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader.us.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[M]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[M]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = mul i64 [[TMP0]], [[TMP2]]
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
+; CHECK:       for.cond1.preheader.us:
+; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i64 [ [[INDVAR_NEXT3:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
+; CHECK-NEXT:    [[I_018_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw i64 [[INDVAR2]], [[TMP1]]
+; CHECK-NEXT:    [[MUL_US:%.*]] = mul nsw i32 [[I_018_US]], [[M]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[MUL_US]] to i64
+; CHECK-NEXT:    [[FLATTEN_TRUNCIV:%.*]] = trunc i64 [[INDVAR2]] to i32
+; CHECK-NEXT:    br label [[FOR_BODY4_US:%.*]]
+; CHECK:       for.body4.us:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ]
+; CHECK-NEXT:    [[J_016_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i64 [[INDVAR]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[J_016_US]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw i64 [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[ADD_US:%.*]] = add nsw i32 [[J_016_US]], [[MUL_US]]
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[FLATTEN_TRUNCIV]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVAR2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4
+; CHECK-NEXT:    tail call void @g(i32 [[TMP8]])
+; CHECK-NEXT:    [[INDVAR_NEXT:%.*]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT:    [[INC_US:%.*]] = add nuw nsw i32 [[J_016_US]], 1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp slt i64 [[INDVAR_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]
+; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us:
+; CHECK-NEXT:    [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1
+; CHECK-NEXT:    [[INC6_US]] = add nuw nsw i32 [[I_018_US]], 1
+; CHECK-NEXT:    [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[I_018:%.*]] = phi i32 [ [[INC6:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[INC6]] = add nuw nsw i32 [[I_018]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC6]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND1_PREHEADER]], label [[FOR_COND_CLEANUP_LOOPEXIT19:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup.loopexit19:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp17 = icmp sgt i32 %N, 0
+  br i1 %cmp17, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup
+
+for.cond1.preheader.lr.ph:
+  %cmp215 = icmp sgt i32 %M, 0
+  br i1 %cmp215, label %for.cond1.preheader.us.preheader, label %for.cond1.preheader.preheader
+
+for.cond1.preheader.preheader:
+  br label %for.cond1.preheader
+
+for.cond1.preheader.us.preheader:
+  br label %for.cond1.preheader.us
+
+for.cond1.preheader.us:
+  %i.018.us = phi i32 [ %inc6.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+  %mul.us = mul nsw i32 %i.018.us, %M
+  br label %for.body4.us
+
+for.body4.us:
+  %j.016.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ]
+  %add.us = add nsw i32 %j.016.us, %mul.us
+  %idxprom.us = sext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, i32* %A, i64 %idxprom.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  tail call void @g(i32 %0)
+  %inc.us = add nuw nsw i32 %j.016.us, 1
+  %cmp2.us = icmp slt i32 %inc.us, %M
+  br i1 %cmp2.us, label %for.body4.us, label %for.cond1.for.cond.cleanup3_crit_edge.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us:
+  %inc6.us = add nuw nsw i32 %i.018.us, 1
+  %cmp.us = icmp slt i32 %inc6.us, %N
+  br i1 %cmp.us, label %for.cond1.preheader.us, label %for.cond.cleanup.loopexit
+
+for.cond1.preheader:
+  %i.018 = phi i32 [ %inc6, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
+  %inc6 = add nuw nsw i32 %i.018, 1
+  %cmp = icmp slt i32 %inc6, %N
+  br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup.loopexit19
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup.loopexit19:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; This test case corresponds to this input:
+;
+;  void foo2_zext(unsigned *A, ..) {
+;     for (unsigned i = 0; i < N; ++i)
+;       for (unsigned j = 0; j < M; ++j)
+;         f(A[i*M+j]);
+;
+define void @foo2_zext(i32* nocapture readonly %A, i32 %N, i32 %M) {
+; CHECK-LABEL: @foo2_zext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP17_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP17_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]]
+; CHECK:       for.cond1.preheader.lr.ph:
+; CHECK-NEXT:    [[CMP215_NOT:%.*]] = icmp eq i32 [[M:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP215_NOT]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader.us.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[M]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = mul i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
+; CHECK:       for.cond1.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader.us:
+; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i64 [ [[INDVAR_NEXT2:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVAR1]] to i32
+; CHECK-NEXT:    [[MUL_US:%.*]] = mul i32 [[TMP2]], [[M]]
+; CHECK-NEXT:    [[FLATTEN_TRUNCIV:%.*]] = trunc i64 [[INDVAR1]] to i32
+; CHECK-NEXT:    br label [[FOR_BODY4_US:%.*]]
+; CHECK:       for.body4.us:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDVAR]] to i32
+; CHECK-NEXT:    [[ADD_US:%.*]] = add i32 [[TMP3]], [[MUL_US]]
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext i32 [[FLATTEN_TRUNCIV]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4
+; CHECK-NEXT:    tail call void @g(i32 [[TMP4]])
+; CHECK-NEXT:    [[INDVAR_NEXT:%.*]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ult i64 [[INDVAR_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]
+; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us:
+; CHECK-NEXT:    [[INDVAR_NEXT2]] = add i64 [[INDVAR1]], 1
+; CHECK-NEXT:    [[CMP_US:%.*]] = icmp ult i64 [[INDVAR_NEXT2]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT19:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[I_018:%.*]] = phi i32 [ [[INC6:%.*]], [[FOR_COND1_PREHEADER]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[INC6]] = add i32 [[I_018]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[INC6]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND1_PREHEADER]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup.loopexit19:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp17.not = icmp eq i32 %N, 0
+  br i1 %cmp17.not, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph:
+  %cmp215.not = icmp eq i32 %M, 0
+  br i1 %cmp215.not, label %for.cond1.preheader.preheader, label %for.cond1.preheader.us.preheader
+
+for.cond1.preheader.us.preheader:
+  br label %for.cond1.preheader.us
+
+for.cond1.preheader.preheader:
+  br label %for.cond1.preheader
+
+for.cond1.preheader.us:
+  %i.018.us = phi i32 [ %inc6.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+  %mul.us = mul i32 %i.018.us, %M
+  br label %for.body4.us
+
+for.body4.us:
+  %j.016.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ]
+  %add.us = add i32 %j.016.us, %mul.us
+  %idxprom.us = zext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, i32* %A, i64 %idxprom.us
+  %0 = load i32, i32* %arrayidx.us, align 4
+  tail call void @g(i32 %0)
+  %inc.us = add nuw i32 %j.016.us, 1
+  %cmp2.us = icmp ult i32 %inc.us, %M
+  br i1 %cmp2.us, label %for.body4.us, label %for.cond1.for.cond.cleanup3_crit_edge.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us:
+  %inc6.us = add i32 %i.018.us, 1
+  %cmp.us = icmp ult i32 %inc6.us, %N
+  br i1 %cmp.us, label %for.cond1.preheader.us, label %for.cond.cleanup.loopexit19
+
+for.cond1.preheader:
+  %i.018 = phi i32 [ %inc6, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
+  %inc6 = add i32 %i.018, 1
+  %cmp = icmp ult i32 %inc6, %N
+  br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup.loopexit
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup.loopexit19:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
 define void @zext(i32 %N, i16* nocapture %A, i16 %val) {
 ; CHECK-LABEL: @zext(
 ; CHECK-NEXT:  entry:
@@ -433,50 +662,6 @@ define void @test4(i16 %n, i16 %m) {
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ;
-; DONTWIDEN-LABEL: @test4(
-; DONTWIDEN-NEXT:  entry:
-; DONTWIDEN-NEXT:    [[CMP38:%.*]] = icmp sgt i16 [[N:%.*]], 0
-; DONTWIDEN-NEXT:    br i1 [[CMP38]], label [[FOR_COND3_PREHEADER_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DONTWIDEN:       for.cond3.preheader.lr.ph:
-; DONTWIDEN-NEXT:    [[CMP636:%.*]] = icmp sgt i16 [[M:%.*]], 0
-; DONTWIDEN-NEXT:    br i1 [[CMP636]], label [[FOR_COND3_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND3_PREHEADER_PREHEADER:%.*]]
-; DONTWIDEN:       for.cond3.preheader.preheader:
-; DONTWIDEN-NEXT:    br label [[FOR_COND3_PREHEADER:%.*]]
-; DONTWIDEN:       for.cond3.preheader.us.preheader:
-; DONTWIDEN-NEXT:    br label [[FOR_COND3_PREHEADER_US:%.*]]
-; DONTWIDEN:       for.cond3.preheader.us:
-; DONTWIDEN-NEXT:    [[I_039_US:%.*]] = phi i16 [ [[INC22_US:%.*]], [[FOR_COND3_FOR_COND_CLEANUP8_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND3_PREHEADER_US_PREHEADER]] ]
-; DONTWIDEN-NEXT:    [[MUL_US:%.*]] = mul i16 [[I_039_US]], [[M]]
-; DONTWIDEN-NEXT:    br label [[FOR_BODY9_US:%.*]]
-; DONTWIDEN:       for.body9.us:
-; DONTWIDEN-NEXT:    [[J_037_US:%.*]] = phi i16 [ 0, [[FOR_COND3_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY9_US]] ]
-; DONTWIDEN-NEXT:    [[ADD_US:%.*]] = add i16 [[J_037_US]], [[MUL_US]]
-; DONTWIDEN-NEXT:    [[CONV14_US:%.*]] = sext i16 [[ADD_US]] to i32
-; DONTWIDEN-NEXT:    [[CALL_US:%.*]] = tail call i32 @use_32(i32 [[CONV14_US]])
-; DONTWIDEN-NEXT:    [[CALL15_US:%.*]] = tail call i32 @use_16(i16 [[ADD_US]])
-; DONTWIDEN-NEXT:    [[CALL17_US:%.*]] = tail call i32 @use_32(i32 [[CONV14_US]])
-; DONTWIDEN-NEXT:    [[CALL18_US:%.*]] = tail call i32 @use_16(i16 [[ADD_US]])
-; DONTWIDEN-NEXT:    [[CONV19_US:%.*]] = sext i16 [[ADD_US]] to i64
-; DONTWIDEN-NEXT:    [[CALL20_US:%.*]] = tail call i32 @use_64(i64 [[CONV19_US]])
-; DONTWIDEN-NEXT:    [[INC_US]] = add nuw nsw i16 [[J_037_US]], 1
-; DONTWIDEN-NEXT:    [[CMP6_US:%.*]] = icmp slt i16 [[INC_US]], [[M]]
-; DONTWIDEN-NEXT:    br i1 [[CMP6_US]], label [[FOR_BODY9_US]], label [[FOR_COND3_FOR_COND_CLEANUP8_CRIT_EDGE_US]]
-; DONTWIDEN:       for.cond3.for.cond.cleanup8_crit_edge.us:
-; DONTWIDEN-NEXT:    [[INC22_US]] = add i16 [[I_039_US]], 1
-; DONTWIDEN-NEXT:    [[CMP_US:%.*]] = icmp slt i16 [[INC22_US]], [[N]]
-; DONTWIDEN-NEXT:    br i1 [[CMP_US]], label [[FOR_COND3_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; DONTWIDEN:       for.cond3.preheader:
-; DONTWIDEN-NEXT:    [[I_039:%.*]] = phi i16 [ [[INC22:%.*]], [[FOR_COND3_PREHEADER]] ], [ 0, [[FOR_COND3_PREHEADER_PREHEADER]] ]
-; DONTWIDEN-NEXT:    [[INC22]] = add i16 [[I_039]], 1
-; DONTWIDEN-NEXT:    [[CMP:%.*]] = icmp slt i16 [[INC22]], [[N]]
-; DONTWIDEN-NEXT:    br i1 [[CMP]], label [[FOR_COND3_PREHEADER]], label [[FOR_COND_CLEANUP_LOOPEXIT1:%.*]]
-; DONTWIDEN:       for.cond.cleanup.loopexit:
-; DONTWIDEN-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DONTWIDEN:       for.cond.cleanup.loopexit1:
-; DONTWIDEN-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DONTWIDEN:       for.cond.cleanup:
-; DONTWIDEN-NEXT:    ret void
-;
 entry:
   %cmp38 = icmp sgt i16 %n, 0
   br i1 %cmp38, label %for.cond3.preheader.lr.ph, label %for.cond.cleanup
@@ -574,5 +759,6 @@ declare void @payload()
 declare dso_local i32 @use_32(i32)
 declare dso_local i32 @use_16(i16)
 declare dso_local i32 @use_64(i64)
+declare dso_local void @g(i32)
 
 declare dso_local void @f(i32* %0) local_unnamed_addr #1