[llvm] 9aa7733 - [LoopFlatten] Widen the IV

Sjoerd Meijer via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 16 02:20:45 PST 2020


Author: Sjoerd Meijer
Date: 2020-11-16T10:20:13Z
New Revision: 9aa773381bd546f205f33e4d91dcfa89526ec0e2

URL: https://github.com/llvm/llvm-project/commit/9aa773381bd546f205f33e4d91dcfa89526ec0e2
DIFF: https://github.com/llvm/llvm-project/commit/9aa773381bd546f205f33e4d91dcfa89526ec0e2.diff

LOG: [LoopFlatten] Widen the IV

Widen the IV to the widest available and legal integer type, which makes this
transformation always safe so that we can skip overflow checks.

Motivation is to let this pass trigger on 64-bit targets too, and this is the
last patch in a series to achieve this: D90402 moves pass LoopFlatten to just
before IndVarSimplify so that IVs are not already widened, D90421 factors out
widening from IndVarSimplify into Utils/SimplifyIndVar so that we can also use
it in LoopFlatten.

Differential Revision: https://reviews.llvm.org/D90640

Added: 
    llvm/test/Transforms/LoopFlatten/widen-iv.ll

Modified: 
    llvm/lib/Transforms/Scalar/LoopFlatten.cpp
    llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index 6167e2d06ddd..3b247fbf729f 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -43,7 +43,10 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
 
 #define DEBUG_TYPE "loop-flatten"
 
@@ -61,6 +64,12 @@ static cl::opt<bool>
                      cl::desc("Assume that the product of the two iteration "
                               "limits will never overflow"));
 
+static cl::opt<bool>
+    WidenIV("loop-flatten-widen-iv", cl::Hidden,
+            cl::init(true),
+            cl::desc("Widen the loop induction variables, if possible, so "
+                     "overflow checks won't reject flattening"));
+
 struct FlattenInfo {
   Loop *OuterLoop = nullptr;
   Loop *InnerLoop = nullptr;
@@ -254,6 +263,7 @@ static bool checkPHIs(struct FlattenInfo &FI,
     }
   }
 
+  LLVM_DEBUG(dbgs() << "checkPHIs: OK\n");
   return true;
 }
 
@@ -306,9 +316,12 @@ checkOuterLoopInsts(struct FlattenInfo &FI,
                     << RepeatedInstrCost << "\n");
   // Bail out if flattening the loops would cause instructions in the outer
   // loop but not in the inner loop to be executed extra times.
-  if (RepeatedInstrCost > RepeatedInstructionThreshold)
+  if (RepeatedInstrCost > RepeatedInstructionThreshold) {
+    LLVM_DEBUG(dbgs() << "checkOuterLoopInsts: not profitable, bailing.\n");
     return false;
+  }
 
+  LLVM_DEBUG(dbgs() << "checkOuterLoopInsts: OK\n");
   return true;
 }
 
@@ -321,6 +334,10 @@ static bool checkIVUsers(struct FlattenInfo &FI) {
   // require a div/mod to reconstruct in the flattened loop, so the
   // transformation wouldn't be profitable.
 
+  Value *InnerLimit = FI.InnerLimit;
+  if (auto *I = dyn_cast<SExtInst>(InnerLimit))
+    InnerLimit = I->getOperand(0);
+
   // Check that all uses of the inner loop's induction variable match the
   // expected pattern, recording the uses of the outer IV.
   SmallPtrSet<Value *, 4> ValidOuterPHIUses;
@@ -328,15 +345,32 @@ static bool checkIVUsers(struct FlattenInfo &FI) {
     if (U == FI.InnerIncrement)
       continue;
 
+    // After widening the IVs, a trunc instruction might have been introduced, so
+    // look through truncs.
+    if (dyn_cast<TruncInst>(U) ) {
+      if (!U->hasOneUse())
+        return false;
+      U = *U->user_begin();
+    }
+
     LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump());
 
-    Value *MatchedMul, *MatchedItCount;
-    if (match(U, m_c_Add(m_Specific(FI.InnerInductionPHI),
-                         m_Value(MatchedMul))) &&
-        match(MatchedMul,
-              m_c_Mul(m_Specific(FI.OuterInductionPHI),
-                      m_Value(MatchedItCount))) &&
-        MatchedItCount == FI.InnerLimit) {
+    Value *MatchedMul;
+    Value *MatchedItCount;
+    bool IsAdd = match(U, m_c_Add(m_Specific(FI.InnerInductionPHI),
+                                  m_Value(MatchedMul))) &&
+                 match(MatchedMul, m_c_Mul(m_Specific(FI.OuterInductionPHI),
+                                           m_Value(MatchedItCount)));
+
+    // Matches the same pattern as above, except it also looks for truncs
+    // on the phi, which can be the result of widening the induction variables.
+    bool IsAddTrunc = match(U, m_c_Add(m_Trunc(m_Specific(FI.InnerInductionPHI)),
+                                       m_Value(MatchedMul))) &&
+                      match(MatchedMul,
+                            m_c_Mul(m_Trunc(m_Specific(FI.OuterInductionPHI)),
+                            m_Value(MatchedItCount)));
+
+    if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerLimit) {
       LLVM_DEBUG(dbgs() << "Use is optimisable\n");
       ValidOuterPHIUses.insert(MatchedMul);
       FI.LinearIVUses.insert(U);
@@ -352,23 +386,35 @@ static bool checkIVUsers(struct FlattenInfo &FI) {
     if (U == FI.OuterIncrement)
       continue;
 
-    LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump());
-
-    if (!ValidOuterPHIUses.count(U)) {
-      LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
-      return false;
-    } else {
+    auto IsValidOuterPHIUses = [&] (User *U) -> bool {
+      LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump());
+      if (!ValidOuterPHIUses.count(U)) {
+        LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
+        return false;
+      }
       LLVM_DEBUG(dbgs() << "Use is optimisable\n");
+      return true;
+    };
+
+    if (auto *V = dyn_cast<TruncInst>(U)) {
+      for (auto *K : V->users()) {
+        if (!IsValidOuterPHIUses(K))
+          return false;
+      }
+      continue;
     }
+
+    if (!IsValidOuterPHIUses(U))
+      return false;
   }
 
-  LLVM_DEBUG(dbgs() << "Found " << FI.LinearIVUses.size()
+  LLVM_DEBUG(dbgs() << "checkIVUsers: OK\n";
+             dbgs() << "Found " << FI.LinearIVUses.size()
                     << " value(s) that can be replaced:\n";
              for (Value *V : FI.LinearIVUses) {
                dbgs() << "  ";
                V->dump();
              });
-
   return true;
 }
 
@@ -413,15 +459,9 @@ static OverflowResult checkOverflow(struct FlattenInfo &FI,
   return OverflowResult::MayOverflow;
 }
 
-static bool FlattenLoopPair(struct FlattenInfo &FI, DominatorTree *DT,
-                            LoopInfo *LI, ScalarEvolution *SE,
-                            AssumptionCache *AC, TargetTransformInfo *TTI) {
-  Function *F = FI.OuterLoop->getHeader()->getParent();
-  LLVM_DEBUG(dbgs() << "Loop flattening running on outer loop "
-                    << FI.OuterLoop->getHeader()->getName() << " and inner loop "
-                    << FI.InnerLoop->getHeader()->getName() << " in "
-                    << F->getName() << "\n");
-
+static bool CanFlattenLoopPair(struct FlattenInfo &FI, DominatorTree *DT,
+                               LoopInfo *LI, ScalarEvolution *SE,
+                               AssumptionCache *AC, const TargetTransformInfo *TTI) {
   SmallPtrSet<Instruction *, 8> IterationInstructions;
   if (!findLoopComponents(FI.InnerLoop, IterationInstructions, FI.InnerInductionPHI,
                           FI.InnerLimit, FI.InnerIncrement, FI.InnerBranch, SE))
@@ -459,32 +499,16 @@ static bool FlattenLoopPair(struct FlattenInfo &FI, DominatorTree *DT,
   if (!checkIVUsers(FI))
     return false;
 
-  // Check if the new iteration variable might overflow. In this case, we
-  // need to version the loop, and select the original version at runtime if
-  // the iteration space is too large.
-  // TODO: We currently don't version the loop.
-  // TODO: it might be worth using a wider iteration variable rather than
-  // versioning the loop, if a wide enough type is legal.
-  bool MustVersionLoop = true;
-  OverflowResult OR = checkOverflow(FI, DT, AC);
-  if (OR == OverflowResult::AlwaysOverflowsHigh ||
-      OR == OverflowResult::AlwaysOverflowsLow) {
-    LLVM_DEBUG(dbgs() << "Multiply would always overflow, so not profitable\n");
-    return false;
-  } else if (OR == OverflowResult::MayOverflow) {
-    LLVM_DEBUG(dbgs() << "Multiply might overflow, not flattening\n");
-  } else {
-    LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
-    MustVersionLoop = false;
-  }
-
-  // We cannot safely flatten the loop. Exit now.
-  if (MustVersionLoop)
-    return false;
+  LLVM_DEBUG(dbgs() << "CanFlattenLoopPair: OK\n");
+  return true;
+}
 
-  // Do the actual transformation.
+static bool DoFlattenLoopPair(struct FlattenInfo &FI, DominatorTree *DT,
+                              LoopInfo *LI, ScalarEvolution *SE,
+                              AssumptionCache *AC,
+                              const TargetTransformInfo *TTI) {
+  Function *F = FI.OuterLoop->getHeader()->getParent();
   LLVM_DEBUG(dbgs() << "Checks all passed, doing the transformation\n");
-
   {
     using namespace ore;
     OptimizationRemark Remark(DEBUG_TYPE, "Flattened", FI.InnerLoop->getStartLoc(),
@@ -503,6 +527,9 @@ static bool FlattenLoopPair(struct FlattenInfo &FI, DominatorTree *DT,
   // Fix up PHI nodes that take values from the inner loop back-edge, which
   // we are about to remove.
   FI.InnerInductionPHI->removeIncomingValue(FI.InnerLoop->getLoopLatch());
+
+  // The old Phi will be optimised away later, but for now we can't leave
+  // it in an invalid state, so we are updating them too.
   for (PHINode *PHI : FI.InnerPHIsToTransform)
     PHI->removeIncomingValue(FI.InnerLoop->getLoopLatch());
 
@@ -517,10 +544,21 @@ static bool FlattenLoopPair(struct FlattenInfo &FI, DominatorTree *DT,
   BranchInst::Create(InnerExitBlock, InnerExitingBlock);
   DT->deleteEdge(InnerExitingBlock, FI.InnerLoop->getHeader());
 
+  auto HasSExtUser = [] (Value *V) -> Value * {
+    for (User *U : V->users() )
+      if (dyn_cast<SExtInst>(U))
+        return U;
+    return nullptr;
+  };
+
   // Replace all uses of the polynomial calculated from the two induction
   // variables with the one new one.
-  for (Value *V : FI.LinearIVUses)
+  for (Value *V : FI.LinearIVUses) {
+    // If the induction variable has been widened, look through the SExt.
+    if (Value *U = HasSExtUser(V))
+      V = U;
     V->replaceAllUsesWith(FI.OuterInductionPHI);
+  }
 
   // Tell LoopInfo, SCEV and the pass manager that the inner loop has been
   // deleted, and any information that have about the outer loop invalidated.
@@ -530,6 +568,89 @@ static bool FlattenLoopPair(struct FlattenInfo &FI, DominatorTree *DT,
   return true;
 }
 
+static bool CanWidenIV(struct FlattenInfo &FI, DominatorTree *DT,
+                       LoopInfo *LI, ScalarEvolution *SE,
+                       AssumptionCache *AC, const TargetTransformInfo *TTI) {
+  if (!WidenIV) {
+    LLVM_DEBUG(dbgs() << "Widening the IVs is disabled\n");
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Try widening the IVs\n");
+  Module *M = FI.InnerLoop->getHeader()->getParent()->getParent();
+  auto &DL = M->getDataLayout();
+  auto *InnerType = FI.InnerInductionPHI->getType();
+  auto *OuterType = FI.OuterInductionPHI->getType();
+  unsigned MaxLegalSize = DL.getLargestLegalIntTypeSizeInBits();
+  auto *MaxLegalType = DL.getLargestLegalIntType(M->getContext());
+
+  // If both induction types are less than the maximum legal integer width,
+  // promote both to the widest type available so we know calculating
+  // (OuterLimit * InnerLimit) as the new trip count is safe.
+  if (InnerType != OuterType ||
+      InnerType->getScalarSizeInBits() >= MaxLegalSize ||
+      MaxLegalType->getScalarSizeInBits() < InnerType->getScalarSizeInBits() * 2) {
+    LLVM_DEBUG(dbgs() << "Can't widen the IV\n");
+    return false;
+  }
+
+  SCEVExpander Rewriter(*SE, DL, "loopflatten");
+  SmallVector<WideIVInfo, 2> WideIVs;
+  SmallVector<WeakTrackingVH, 4> DeadInsts;
+  WideIVs.push_back( {FI.InnerInductionPHI, MaxLegalType, false });
+  WideIVs.push_back( {FI.OuterInductionPHI, MaxLegalType, false });
+  unsigned ElimExt;
+  unsigned Widened;
+
+  for (unsigned i = 0; i < WideIVs.size(); i++) {
+    PHINode *WidePhi = createWideIV(WideIVs[i], LI, SE, Rewriter, DT, DeadInsts,
+                                    ElimExt, Widened, true /* HasGuards */,
+                                    true /* UsePostIncrementRanges */);
+    if (!WidePhi)
+      return false;
+    LLVM_DEBUG(dbgs() << "Created wide phi: "; WidePhi->dump());
+    LLVM_DEBUG(dbgs() << "Deleting old phi: "; WideIVs[i].NarrowIV->dump());
+    RecursivelyDeleteDeadPHINode(WideIVs[i].NarrowIV);
+  }
+  // After widening, rediscover all the loop components.
+  return CanFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
+}
+
+static bool FlattenLoopPair(struct FlattenInfo &FI, DominatorTree *DT,
+                            LoopInfo *LI, ScalarEvolution *SE,
+                            AssumptionCache *AC,
+                            const TargetTransformInfo *TTI) {
+  Function *F = FI.OuterLoop->getHeader()->getParent();
+  LLVM_DEBUG(dbgs() << "Loop flattening running on outer loop "
+                    << FI.OuterLoop->getHeader()->getName() << " and inner loop "
+                    << FI.InnerLoop->getHeader()->getName() << " in "
+                    << F->getName() << "\n");
+
+  if (!CanFlattenLoopPair(FI, DT, LI, SE, AC, TTI))
+    return false;
+
+  // Check if we can widen the induction variables to avoid overflow checks.
+  if (CanWidenIV(FI, DT, LI, SE, AC, TTI))
+    return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
+
+  // Check if the new iteration variable might overflow. In this case, we
+  // need to version the loop, and select the original version at runtime if
+  // the iteration space is too large.
+  // TODO: We currently don't version the loop.
+  OverflowResult OR = checkOverflow(FI, DT, AC);
+  if (OR == OverflowResult::AlwaysOverflowsHigh ||
+      OR == OverflowResult::AlwaysOverflowsLow) {
+    LLVM_DEBUG(dbgs() << "Multiply would always overflow, so not profitable\n");
+    return false;
+  } else if (OR == OverflowResult::MayOverflow) {
+    LLVM_DEBUG(dbgs() << "Multiply might overflow, not flattening\n");
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
+  return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
+}
+
 bool Flatten(DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE,
              AssumptionCache *AC, TargetTransformInfo *TTI) {
   bool Changed = false;

diff  --git a/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll b/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
index ca7cbd42468f..c563078e25da 100644
--- a/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
+++ b/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
@@ -1,6 +1,8 @@
 ; RUN: opt < %s -S -loop-flatten -debug-only=loop-flatten 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
 ; Every function in this file has a reason that it can't be transformed.
 
 ; CHECK-NOT: Checks all passed, doing the transformation

diff  --git a/llvm/test/Transforms/LoopFlatten/widen-iv.ll b/llvm/test/Transforms/LoopFlatten/widen-iv.ll
new file mode 100644
index 000000000000..bb3fa3360d0e
--- /dev/null
+++ b/llvm/test/Transforms/LoopFlatten/widen-iv.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -loop-flatten -verify-loop-info -verify-dom-info -verify-scev -verify | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -S -loop-flatten -loop-flatten-widen-iv=false -verify-loop-info -verify-dom-info -verify-scev -verify | FileCheck %s --check-prefix=DONTWIDEN
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; Function Attrs: nounwind
+define void @foo(i32* %A, i32 %N, i32 %M) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP17]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond1.preheader.lr.ph:
+; CHECK-NEXT:    [[CMP215:%.*]] = icmp sgt i32 [[M:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP215]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond1.preheader.us.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[M]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = mul i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
+; CHECK:       for.cond1.preheader.us:
+; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i64 [ [[INDVAR_NEXT2:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVAR1]] to i32
+; CHECK-NEXT:    [[MUL_US:%.*]] = mul nsw i32 [[TMP2]], [[M]]
+; CHECK-NEXT:    br label [[FOR_BODY4_US:%.*]]
+; CHECK:       for.body4.us:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDVAR]] to i32
+; CHECK-NEXT:    [[ADD_US:%.*]] = add nsw i32 [[TMP3]], [[MUL_US]]
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVAR1]]
+; CHECK-NEXT:    tail call void @f(i32* [[ARRAYIDX_US]])
+; CHECK-NEXT:    [[INDVAR_NEXT:%.*]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp slt i64 [[INDVAR_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]
+; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us:
+; CHECK-NEXT:    [[INDVAR_NEXT2]] = add i64 [[INDVAR1]], 1
+; CHECK-NEXT:    [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT2]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+; DONTWIDEN-LABEL: @foo(
+; DONTWIDEN-NEXT:  entry:
+; DONTWIDEN-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DONTWIDEN-NEXT:    br i1 [[CMP17]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DONTWIDEN:       for.cond1.preheader.lr.ph:
+; DONTWIDEN-NEXT:    [[CMP215:%.*]] = icmp sgt i32 [[M:%.*]], 0
+; DONTWIDEN-NEXT:    br i1 [[CMP215]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND_CLEANUP]]
+; DONTWIDEN:       for.cond1.preheader.us.preheader:
+; DONTWIDEN-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
+; DONTWIDEN:       for.cond1.preheader.us:
+; DONTWIDEN-NEXT:    [[I_018_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
+; DONTWIDEN-NEXT:    [[MUL_US:%.*]] = mul nsw i32 [[I_018_US]], [[M]]
+; DONTWIDEN-NEXT:    br label [[FOR_BODY4_US:%.*]]
+; DONTWIDEN:       for.body4.us:
+; DONTWIDEN-NEXT:    [[J_016_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY4_US]] ]
+; DONTWIDEN-NEXT:    [[ADD_US:%.*]] = add nsw i32 [[J_016_US]], [[MUL_US]]
+; DONTWIDEN-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64
+; DONTWIDEN-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
+; DONTWIDEN-NEXT:    tail call void @f(i32* [[ARRAYIDX_US]])
+; DONTWIDEN-NEXT:    [[INC_US]] = add nuw nsw i32 [[J_016_US]], 1
+; DONTWIDEN-NEXT:    [[CMP2_US:%.*]] = icmp slt i32 [[INC_US]], [[M]]
+; DONTWIDEN-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]
+; DONTWIDEN:       for.cond1.for.cond.cleanup3_crit_edge.us:
+; DONTWIDEN-NEXT:    [[INC6_US]] = add nuw nsw i32 [[I_018_US]], 1
+; DONTWIDEN-NEXT:    [[CMP_US:%.*]] = icmp slt i32 [[INC6_US]], [[N]]
+; DONTWIDEN-NEXT:    br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; DONTWIDEN:       for.cond.cleanup.loopexit:
+; DONTWIDEN-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DONTWIDEN:       for.cond.cleanup:
+; DONTWIDEN-NEXT:    ret void
+;
+entry:
+  %cmp17 = icmp sgt i32 %N, 0
+  br i1 %cmp17, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup
+
+for.cond1.preheader.lr.ph:
+  %cmp215 = icmp sgt i32 %M, 0
+  br i1 %cmp215, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup
+
+for.cond1.preheader.us.preheader:
+  br label %for.cond1.preheader.us
+
+for.cond1.preheader.us:
+  %i.018.us = phi i32 [ %inc6.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+  %mul.us = mul nsw i32 %i.018.us, %M
+  br label %for.body4.us
+
+for.body4.us:
+  %j.016.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ]
+  %add.us = add nsw i32 %j.016.us, %mul.us
+  %idxprom.us = sext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i32, i32* %A, i64 %idxprom.us
+  tail call void @f(i32* %arrayidx.us) #2
+  %inc.us = add nuw nsw i32 %j.016.us, 1
+  %cmp2.us = icmp slt i32 %inc.us, %M
+  br i1 %cmp2.us, label %for.body4.us, label %for.cond1.for.cond.cleanup3_crit_edge.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us:
+  %inc6.us = add nuw nsw i32 %i.018.us, 1
+  %cmp.us = icmp slt i32 %inc6.us, %N
+  br i1 %cmp.us, label %for.cond1.preheader.us, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+declare dso_local void @f(i32* %0) local_unnamed_addr #1


        


More information about the llvm-commits mailing list