[llvm-branch-commits] [llvm] 33b2c88 - [LoopFlatten] Widen IV, support ZExt.

Mon Nov 23 01:02:00 PST 2020

Author: Sjoerd Meijer
Date: 2020-11-23T08:57:19Z
New Revision: 33b2c88fa8223dbf15846ce18cc957e33e0d67fc

URL: https://github.com/llvm/llvm-project/commit/33b2c88fa8223dbf15846ce18cc957e33e0d67fc
DIFF: https://github.com/llvm/llvm-project/commit/33b2c88fa8223dbf15846ce18cc957e33e0d67fc.diff

LOG: [LoopFlatten] Widen IV, support ZExt.

I disabled the widening in fa5cb4b because it ran into an assert, which was
related to replacing values with different types. I forgot that an extend could
also be a zero-extend, which I have added now. This means that the approach now
is to create and insert a trunc of the outer loop's induction variable for each
user, and use that to replace IV values.

Differential Revision: https://reviews.llvm.org/D91690

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/LoopFlatten.cpp
    llvm/test/Transforms/LoopFlatten/widen-iv.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index 3d9617d43aea..aaff68436c13 100644

--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Verifier.h"
@@ -66,7 +67,7 @@ static cl::opt<bool>
 
 static cl::opt<bool>
     WidenIV("loop-flatten-widen-iv", cl::Hidden,
-            cl::init(false),
+            cl::init(true),
             cl::desc("Widen the loop induction variables, if possible, so "
                      "overflow checks won't reject flattening"));
 
@@ -84,6 +85,9 @@ struct FlattenInfo {
   SmallPtrSet<Value *, 4> LinearIVUses;
   SmallPtrSet<PHINode *, 4> InnerPHIsToTransform;
 
+  // Whether this holds the flatten info before or after widening.
+  bool Widened = false;
+
   FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL) {};
 };
 
@@ -335,8 +339,9 @@ static bool checkIVUsers(struct FlattenInfo &FI) {
   // transformation wouldn't be profitable.
 
   Value *InnerLimit = FI.InnerLimit;
-  if (auto *I = dyn_cast<SExtInst>(InnerLimit))
-    InnerLimit = I->getOperand(0);
+  if (FI.Widened &&
+      (isa<SExtInst>(InnerLimit) || isa<ZExtInst>(InnerLimit)))
+    InnerLimit = cast<Instruction>(InnerLimit)->getOperand(0);
 
   // Check that all uses of the inner loop's induction variable match the
   // expected pattern, recording the uses of the outer IV.
@@ -347,7 +352,7 @@ static bool checkIVUsers(struct FlattenInfo &FI) {
 
     // After widening the IVs, a trunc instruction might have been introduced, so
     // look through truncs.
-    if (dyn_cast<TruncInst>(U) ) {
+    if (isa<TruncInst>(U)) {
       if (!U->hasOneUse())
         return false;
       U = *U->user_begin();
@@ -544,20 +549,18 @@ static bool DoFlattenLoopPair(struct FlattenInfo &FI, DominatorTree *DT,
   BranchInst::Create(InnerExitBlock, InnerExitingBlock);
   DT->deleteEdge(InnerExitingBlock, FI.InnerLoop->getHeader());
 
-  auto HasSExtUser = [] (Value *V) -> Value * {
-    for (User *U : V->users() )
-      if (dyn_cast<SExtInst>(U))
-        return U;
-    return nullptr;
-  };
-
   // Replace all uses of the polynomial calculated from the two induction
   // variables with the one new one.
+  IRBuilder<> Builder(FI.OuterInductionPHI->getParent()->getTerminator());
   for (Value *V : FI.LinearIVUses) {
-    // If the induction variable has been widened, look through the SExt.
-    if (Value *U = HasSExtUser(V))
-      V = U;
-    V->replaceAllUsesWith(FI.OuterInductionPHI);
+    Value *OuterValue = FI.OuterInductionPHI;
+    if (FI.Widened)
+      OuterValue = Builder.CreateTrunc(FI.OuterInductionPHI, V->getType(),
+                                       "flatten.trunciv");
+
+    LLVM_DEBUG(dbgs() << "Replacing: "; V->dump();
+               dbgs() << "with:      "; OuterValue->dump());
+    V->replaceAllUsesWith(OuterValue);
   }
 
   // Tell LoopInfo, SCEV and the pass manager that the inner loop has been
@@ -613,6 +616,8 @@ static bool CanWidenIV(struct FlattenInfo &FI, DominatorTree *DT,
     RecursivelyDeleteDeadPHINode(WideIVs[i].NarrowIV);
   }
   // After widening, rediscover all the loop components.
+  assert(Widened && "Widenend IV expected");
+  FI.Widened = true;
   return CanFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
 }
 

diff  --git a/llvm/test/Transforms/LoopFlatten/widen-iv.ll b/llvm/test/Transforms/LoopFlatten/widen-iv.ll
index 579061833bf4..9ac9215a8d95 100644
--- a/llvm/test/Transforms/LoopFlatten/widen-iv.ll
+++ b/llvm/test/Transforms/LoopFlatten/widen-iv.ll
@@ -4,6 +4,9 @@
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
+; DONTWIDEN-NOT:   %flatten.tripcount
+; DONTWIDEN-NOT:   %flatten.trunciv
+
 ; Function Attrs: nounwind
 define void @foo(i32* %A, i32 %N, i32 %M) {
 ; CHECK-LABEL: @foo(
@@ -22,13 +25,14 @@ define void @foo(i32* %A, i32 %N, i32 %M) {
 ; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i64 [ [[INDVAR_NEXT2:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVAR1]] to i32
 ; CHECK-NEXT:    [[MUL_US:%.*]] = mul nsw i32 [[TMP2]], [[M]]
+; CHECK-NEXT:    [[FLATTEN_TRUNCIV:%.*]] = trunc i64 [[INDVAR1]] to i32
 ; CHECK-NEXT:    br label [[FOR_BODY4_US:%.*]]
 ; CHECK:       for.body4.us:
 ; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDVAR]] to i32
 ; CHECK-NEXT:    [[ADD_US:%.*]] = add nsw i32 [[TMP3]], [[MUL_US]]
-; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64
-; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVAR1]]
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[FLATTEN_TRUNCIV]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
 ; CHECK-NEXT:    tail call void @f(i32* [[ARRAYIDX_US]])
 ; CHECK-NEXT:    [[INDVAR_NEXT:%.*]] = add i64 [[INDVAR]], 1
 ; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp slt i64 [[INDVAR_NEXT]], [[TMP0]]
@@ -42,37 +46,6 @@ define void @foo(i32* %A, i32 %N, i32 %M) {
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ;
-; DONTWIDEN-LABEL: @foo(
-; DONTWIDEN-NEXT:  entry:
-; DONTWIDEN-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DONTWIDEN-NEXT:    br i1 [[CMP17]], label [[FOR_COND1_PREHEADER_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DONTWIDEN:       for.cond1.preheader.lr.ph:
-; DONTWIDEN-NEXT:    [[CMP215:%.*]] = icmp sgt i32 [[M:%.*]], 0
-; DONTWIDEN-NEXT:    br i1 [[CMP215]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND_CLEANUP]]
-; DONTWIDEN:       for.cond1.preheader.us.preheader:
-; DONTWIDEN-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
-; DONTWIDEN:       for.cond1.preheader.us:
-; DONTWIDEN-NEXT:    [[I_018_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
-; DONTWIDEN-NEXT:    [[MUL_US:%.*]] = mul nsw i32 [[I_018_US]], [[M]]
-; DONTWIDEN-NEXT:    br label [[FOR_BODY4_US:%.*]]
-; DONTWIDEN:       for.body4.us:
-; DONTWIDEN-NEXT:    [[J_016_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY4_US]] ]
-; DONTWIDEN-NEXT:    [[ADD_US:%.*]] = add nsw i32 [[J_016_US]], [[MUL_US]]
-; DONTWIDEN-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64
-; DONTWIDEN-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
-; DONTWIDEN-NEXT:    tail call void @f(i32* [[ARRAYIDX_US]])
-; DONTWIDEN-NEXT:    [[INC_US]] = add nuw nsw i32 [[J_016_US]], 1
-; DONTWIDEN-NEXT:    [[CMP2_US:%.*]] = icmp slt i32 [[INC_US]], [[M]]
-; DONTWIDEN-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]]
-; DONTWIDEN:       for.cond1.for.cond.cleanup3_crit_edge.us:
-; DONTWIDEN-NEXT:    [[INC6_US]] = add nuw nsw i32 [[I_018_US]], 1
-; DONTWIDEN-NEXT:    [[CMP_US:%.*]] = icmp slt i32 [[INC6_US]], [[N]]
-; DONTWIDEN-NEXT:    br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; DONTWIDEN:       for.cond.cleanup.loopexit:
-; DONTWIDEN-NEXT:    br label [[FOR_COND_CLEANUP]]
-; DONTWIDEN:       for.cond.cleanup:
-; DONTWIDEN-NEXT:    ret void
-;
 entry:
   %cmp17 = icmp sgt i32 %N, 0
   br i1 %cmp17, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup
@@ -108,4 +81,452 @@ for.cond.cleanup:
   ret void
 }
 
+define void @zext(i32 %N, i16* nocapture %A, i16 %val) {
+; CHECK-LABEL: @zext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP20_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP20_NOT]], label [[FOR_END9:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader.us.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = mul i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_US:%.*]]
+; CHECK:       for.cond1.preheader.us:
+; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i64 [ [[INDVAR_NEXT2:%.*]], [[FOR_COND1_FOR_INC7_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVAR1]] to i32
+; CHECK-NEXT:    [[MUL_US:%.*]] = mul i32 [[TMP2]], [[N]]
+; CHECK-NEXT:    [[FLATTEN_TRUNCIV:%.*]] = trunc i64 [[INDVAR1]] to i32
+; CHECK-NEXT:    br label [[FOR_BODY3_US:%.*]]
+; CHECK:       for.body3.us:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDVAR]] to i32
+; CHECK-NEXT:    [[ADD_US:%.*]] = add i32 [[TMP3]], [[MUL_US]]
+; CHECK-NEXT:    [[IDXPROM_US:%.*]] = zext i32 [[FLATTEN_TRUNCIV]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX_US]], align 2
+; CHECK-NEXT:    [[ADD5_US:%.*]] = add i16 [[TMP4]], [[VAL:%.*]]
+; CHECK-NEXT:    store i16 [[ADD5_US]], i16* [[ARRAYIDX_US]], align 2
+; CHECK-NEXT:    [[INDVAR_NEXT:%.*]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ult i64 [[INDVAR_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br label [[FOR_COND1_FOR_INC7_CRIT_EDGE_US]]
+; CHECK:       for.cond1.for.inc7_crit_edge.us:
+; CHECK-NEXT:    [[INDVAR_NEXT2]] = add i64 [[INDVAR1]], 1
+; CHECK-NEXT:    [[CMP_US:%.*]] = icmp ult i64 [[INDVAR_NEXT2]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    br i1 [[CMP_US]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_END9_LOOPEXIT:%.*]]
+; CHECK:       for.end9.loopexit:
+; CHECK-NEXT:    br label [[FOR_END9]]
+; CHECK:       for.end9:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp20.not = icmp eq i32 %N, 0
+  br i1 %cmp20.not, label %for.end9, label %for.cond1.preheader.us.preheader
+
+for.cond1.preheader.us.preheader:
+  br label %for.cond1.preheader.us
+
+for.cond1.preheader.us:
+  %i.021.us = phi i32 [ %inc8.us, %for.cond1.for.inc7_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
+  %mul.us = mul i32 %i.021.us, %N
+  br label %for.body3.us
+
+for.body3.us:
+  %j.019.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body3.us ]
+  %add.us = add i32 %j.019.us, %mul.us
+  %idxprom.us = zext i32 %add.us to i64
+  %arrayidx.us = getelementptr inbounds i16, i16* %A, i64 %idxprom.us
+  %0 = load i16, i16* %arrayidx.us, align 2
+  %add5.us = add i16 %0, %val
+  store i16 %add5.us, i16* %arrayidx.us, align 2
+  %inc.us = add nuw i32 %j.019.us, 1
+  %cmp2.us = icmp ult i32 %inc.us, %N
+  br i1 %cmp2.us, label %for.body3.us, label %for.cond1.for.inc7_crit_edge.us
+
+for.cond1.for.inc7_crit_edge.us:
+  %inc8.us = add i32 %i.021.us, 1
+  %cmp.us = icmp ult i32 %inc8.us, %N
+  br i1 %cmp.us, label %for.cond1.preheader.us, label %for.end9.loopexit
+
+for.end9.loopexit:
+  br label %for.end9
+
+for.end9:
+  ret void
+}
+
+; This IR corresponds to this input:
+;
+; void test(char n, char m) {
+;   for(char i = 0; i < n; i++)
+;     for(char j = 0; j < m; j++) {
+;       char x = i*m+j;
+;       use_32(x);
+;     }
+; }
+;
+define void @test(i8 %n, i8 %m) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP25_NOT:%.*]] = icmp eq i8 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP25_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND3_PREHEADER_LR_PH:%.*]]
+; CHECK:       for.cond3.preheader.lr.ph:
+; CHECK-NEXT:    [[CMP623_NOT:%.*]] = icmp eq i8 [[M:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP623_NOT]], label [[FOR_COND3_PREHEADER_PREHEADER:%.*]], label [[FOR_COND3_PREHEADER_US_PREHEADER:%.*]]
+; CHECK:       for.cond3.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND3_PREHEADER:%.*]]
+; CHECK:       for.cond3.preheader.us.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[M]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[N]] to i64
+; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = mul i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br label [[FOR_COND3_PREHEADER_US:%.*]]
+; CHECK:       for.cond3.preheader.us:
+; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i64 [ [[INDVAR_NEXT3:%.*]], [[FOR_COND3_FOR_COND_CLEANUP8_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND3_PREHEADER_US_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVAR2]] to i8
+; CHECK-NEXT:    [[MUL_US:%.*]] = mul i8 [[TMP2]], [[M]]
+; CHECK-NEXT:    [[FLATTEN_TRUNCIV:%.*]] = trunc i64 [[INDVAR2]] to i8
+; CHECK-NEXT:    br label [[FOR_BODY9_US:%.*]]
+; CHECK:       for.body9.us:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[FOR_COND3_PREHEADER_US]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDVAR]] to i8
+; CHECK-NEXT:    [[ADD_US:%.*]] = add i8 [[TMP3]], [[MUL_US]]
+; CHECK-NEXT:    [[CONV14_US:%.*]] = zext i8 [[FLATTEN_TRUNCIV]] to i32
+; CHECK-NEXT:    [[CALL_US:%.*]] = tail call i32 @use_32(i32 [[CONV14_US]])
+; CHECK-NEXT:    [[INDVAR_NEXT:%.*]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT:    [[CMP6_US:%.*]] = icmp ult i64 [[INDVAR_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br label [[FOR_COND3_FOR_COND_CLEANUP8_CRIT_EDGE_US]]
+; CHECK:       for.cond3.for.cond.cleanup8_crit_edge.us:
+; CHECK-NEXT:    [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1
+; CHECK-NEXT:    [[CMP_US:%.*]] = icmp ult i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    br i1 [[CMP_US]], label [[FOR_COND3_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT1:%.*]]
+; CHECK:       for.cond3.preheader:
+; CHECK-NEXT:    [[I_026:%.*]] = phi i8 [ [[INC16:%.*]], [[FOR_COND3_PREHEADER]] ], [ 0, [[FOR_COND3_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[INC16]] = add i8 [[I_026]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 [[INC16]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND3_PREHEADER]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup.loopexit1:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp25.not = icmp eq i8 %n, 0
+  br i1 %cmp25.not, label %for.cond.cleanup, label %for.cond3.preheader.lr.ph
+
+for.cond3.preheader.lr.ph:
+  %cmp623.not = icmp eq i8 %m, 0
+  br i1 %cmp623.not, label %for.cond3.preheader.preheader, label %for.cond3.preheader.us.preheader
+
+for.cond3.preheader.preheader:
+  br label %for.cond3.preheader
+
+for.cond3.preheader.us.preheader:
+  br label %for.cond3.preheader.us
+
+for.cond3.preheader.us:
+  %i.026.us = phi i8 [ %inc16.us, %for.cond3.for.cond.cleanup8_crit_edge.us ], [ 0, %for.cond3.preheader.us.preheader ]
+  %mul.us = mul i8 %i.026.us, %m
+  br label %for.body9.us
+
+for.body9.us:
+  %j.024.us = phi i8 [ 0, %for.cond3.preheader.us ], [ %inc.us, %for.body9.us ]
+  %add.us = add i8 %j.024.us, %mul.us
+  %conv14.us = zext i8 %add.us to i32
+  %call.us = tail call i32 @use_32(i32 %conv14.us) #2
+  %inc.us = add nuw i8 %j.024.us, 1
+  %cmp6.us = icmp ult i8 %inc.us, %m
+  br i1 %cmp6.us, label %for.body9.us, label %for.cond3.for.cond.cleanup8_crit_edge.us
+
+for.cond3.for.cond.cleanup8_crit_edge.us:
+  %inc16.us = add i8 %i.026.us, 1
+  %cmp.us = icmp ult i8 %inc16.us, %n
+  br i1 %cmp.us, label %for.cond3.preheader.us, label %for.cond.cleanup
+
+for.cond3.preheader:
+  %i.026 = phi i8 [ %inc16, %for.cond3.preheader ], [ 0, %for.cond3.preheader.preheader ]
+  %inc16 = add i8 %i.026, 1
+  %cmp = icmp ult i8 %inc16, %n
+  br i1 %cmp, label %for.cond3.preheader, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; This IR corresponds to this input:
+;
+; void test3(char n, char m) {
+;   for(char i = 0; i < n; i++)
+;     for(char j = 0; j < m; j++) {
+;       char x = i*m+j;
+;       use_32(x);
+;       use_16(x);
+;       use_32(x);
+;       use_16(x);
+;       use_64(x);
+;     }
+; }
+;
+define void @test3(i8 %n, i8 %m) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP37_NOT:%.*]] = icmp eq i8 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP37_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND3_PREHEADER_LR_PH:%.*]]
+; CHECK:       for.cond3.preheader.lr.ph:
+; CHECK-NEXT:    [[CMP635_NOT:%.*]] = icmp eq i8 [[M:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP635_NOT]], label [[FOR_COND3_PREHEADER_PREHEADER:%.*]], label [[FOR_COND3_PREHEADER_US_PREHEADER:%.*]]
+; CHECK:       for.cond3.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND3_PREHEADER:%.*]]
+; CHECK:       for.cond3.preheader.us.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[M]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[N]] to i64
+; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = mul i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br label [[FOR_COND3_PREHEADER_US:%.*]]
+; CHECK:       for.cond3.preheader.us:
+; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i64 [ [[INDVAR_NEXT3:%.*]], [[FOR_COND3_FOR_COND_CLEANUP8_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND3_PREHEADER_US_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVAR2]] to i8
+; CHECK-NEXT:    [[MUL_US:%.*]] = mul i8 [[TMP2]], [[M]]
+; CHECK-NEXT:    [[FLATTEN_TRUNCIV:%.*]] = trunc i64 [[INDVAR2]] to i8
+; CHECK-NEXT:    br label [[FOR_BODY9_US:%.*]]
+; CHECK:       for.body9.us:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[FOR_COND3_PREHEADER_US]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDVAR]] to i8
+; CHECK-NEXT:    [[ADD_US:%.*]] = add i8 [[TMP3]], [[MUL_US]]
+; CHECK-NEXT:    [[CONV14_US:%.*]] = zext i8 [[FLATTEN_TRUNCIV]] to i32
+; CHECK-NEXT:    [[CALL_US:%.*]] = tail call i32 @use_32(i32 [[CONV14_US]])
+; CHECK-NEXT:    [[CONV15_US:%.*]] = zext i8 [[FLATTEN_TRUNCIV]] to i16
+; CHECK-NEXT:    [[CALL16_US:%.*]] = tail call i32 @use_16(i16 [[CONV15_US]])
+; CHECK-NEXT:    [[CALL18_US:%.*]] = tail call i32 @use_32(i32 [[CONV14_US]])
+; CHECK-NEXT:    [[CALL20_US:%.*]] = tail call i32 @use_16(i16 [[CONV15_US]])
+; CHECK-NEXT:    [[CONV21_US:%.*]] = zext i8 [[FLATTEN_TRUNCIV]] to i64
+; CHECK-NEXT:    [[CALL22_US:%.*]] = tail call i32 @use_64(i64 [[CONV21_US]])
+; CHECK-NEXT:    [[INDVAR_NEXT:%.*]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT:    [[CMP6_US:%.*]] = icmp ult i64 [[INDVAR_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br label [[FOR_COND3_FOR_COND_CLEANUP8_CRIT_EDGE_US]]
+; CHECK:       for.cond3.for.cond.cleanup8_crit_edge.us:
+; CHECK-NEXT:    [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1
+; CHECK-NEXT:    [[CMP_US:%.*]] = icmp ult i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    br i1 [[CMP_US]], label [[FOR_COND3_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT1:%.*]]
+; CHECK:       for.cond3.preheader:
+; CHECK-NEXT:    [[I_038:%.*]] = phi i8 [ [[INC24:%.*]], [[FOR_COND3_PREHEADER]] ], [ 0, [[FOR_COND3_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[INC24]] = add i8 [[I_038]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 [[INC24]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND3_PREHEADER]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup.loopexit1:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp37.not = icmp eq i8 %n, 0
+  br i1 %cmp37.not, label %for.cond.cleanup, label %for.cond3.preheader.lr.ph
+
+for.cond3.preheader.lr.ph:
+  %cmp635.not = icmp eq i8 %m, 0
+  br i1 %cmp635.not, label %for.cond3.preheader.preheader, label %for.cond3.preheader.us.preheader
+
+for.cond3.preheader.preheader:
+  br label %for.cond3.preheader
+
+for.cond3.preheader.us.preheader:
+  br label %for.cond3.preheader.us
+
+for.cond3.preheader.us:
+  %i.038.us = phi i8 [ %inc24.us, %for.cond3.for.cond.cleanup8_crit_edge.us ], [ 0, %for.cond3.preheader.us.preheader ]
+  %mul.us = mul i8 %i.038.us, %m
+  br label %for.body9.us
+
+for.body9.us:
+  %j.036.us = phi i8 [ 0, %for.cond3.preheader.us ], [ %inc.us, %for.body9.us ]
+  %add.us = add i8 %j.036.us, %mul.us
+  %conv14.us = zext i8 %add.us to i32
+  %call.us = tail call i32 @use_32(i32 %conv14.us)
+  %conv15.us = zext i8 %add.us to i16
+  %call16.us = tail call i32 @use_16(i16 %conv15.us)
+  %call18.us = tail call i32 @use_32(i32 %conv14.us)
+  %call20.us = tail call i32 @use_16(i16 %conv15.us)
+  %conv21.us = zext i8 %add.us to i64
+  %call22.us = tail call i32 @use_64(i64 %conv21.us)
+  %inc.us = add nuw i8 %j.036.us, 1
+  %cmp6.us = icmp ult i8 %inc.us, %m
+  br i1 %cmp6.us, label %for.body9.us, label %for.cond3.for.cond.cleanup8_crit_edge.us
+
+for.cond3.for.cond.cleanup8_crit_edge.us:
+  %inc24.us = add i8 %i.038.us, 1
+  %cmp.us = icmp ult i8 %inc24.us, %n
+  br i1 %cmp.us, label %for.cond3.preheader.us, label %for.cond.cleanup
+
+for.cond3.preheader:
+  %i.038 = phi i8 [ %inc24, %for.cond3.preheader ], [ 0, %for.cond3.preheader.preheader ]
+  %inc24 = add i8 %i.038, 1
+  %cmp = icmp ult i8 %inc24, %n
+  br i1 %cmp, label %for.cond3.preheader, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; This IR corresponds to this input:
+;
+; void test4(short n, short m) {
+;   for(short i = 0; i < n; i++)
+;     for(short j = 0; j < m; j++) {
+;       short x = i*m+j;
+;       use_32(x);
+;       use_16(x);
+;       use_32(x);
+;       use_16(x);
+;       use_64(x);
+;     }
+; }
+;
+define void @test4(i16 %n, i16 %m) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP38:%.*]] = icmp sgt i16 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP38]], label [[FOR_COND3_PREHEADER_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.cond3.preheader.lr.ph:
+; CHECK-NEXT:    [[CMP636:%.*]] = icmp sgt i16 [[M:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP636]], label [[FOR_COND3_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND3_PREHEADER_PREHEADER:%.*]]
+; CHECK:       for.cond3.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND3_PREHEADER:%.*]]
+; CHECK:       for.cond3.preheader.us.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i16 [[M]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[N]] to i64
+; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = mul i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br label [[FOR_COND3_PREHEADER_US:%.*]]
+; CHECK:       for.cond3.preheader.us:
+; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i64 [ [[INDVAR_NEXT3:%.*]], [[FOR_COND3_FOR_COND_CLEANUP8_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND3_PREHEADER_US_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVAR2]] to i16
+; CHECK-NEXT:    [[MUL_US:%.*]] = mul i16 [[TMP2]], [[M]]
+; CHECK-NEXT:    [[FLATTEN_TRUNCIV:%.*]] = trunc i64 [[INDVAR2]] to i16
+; CHECK-NEXT:    br label [[FOR_BODY9_US:%.*]]
+; CHECK:       for.body9.us:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[FOR_COND3_PREHEADER_US]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDVAR]] to i16
+; CHECK-NEXT:    [[ADD_US:%.*]] = add i16 [[TMP3]], [[MUL_US]]
+; CHECK-NEXT:    [[CONV14_US:%.*]] = sext i16 [[FLATTEN_TRUNCIV]] to i32
+; CHECK-NEXT:    [[CALL_US:%.*]] = tail call i32 @use_32(i32 [[CONV14_US]])
+; CHECK-NEXT:    [[CALL15_US:%.*]] = tail call i32 @use_16(i16 [[FLATTEN_TRUNCIV]])
+; CHECK-NEXT:    [[CALL17_US:%.*]] = tail call i32 @use_32(i32 [[CONV14_US]])
+; CHECK-NEXT:    [[CALL18_US:%.*]] = tail call i32 @use_16(i16 [[FLATTEN_TRUNCIV]])
+; CHECK-NEXT:    [[CONV19_US:%.*]] = sext i16 [[FLATTEN_TRUNCIV]] to i64
+; CHECK-NEXT:    [[CALL20_US:%.*]] = tail call i32 @use_64(i64 [[CONV19_US]])
+; CHECK-NEXT:    [[INDVAR_NEXT:%.*]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT:    [[CMP6_US:%.*]] = icmp slt i64 [[INDVAR_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br label [[FOR_COND3_FOR_COND_CLEANUP8_CRIT_EDGE_US]]
+; CHECK:       for.cond3.for.cond.cleanup8_crit_edge.us:
+; CHECK-NEXT:    [[INDVAR_NEXT3]] = add i64 [[INDVAR2]], 1
+; CHECK-NEXT:    [[CMP_US:%.*]] = icmp slt i64 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    br i1 [[CMP_US]], label [[FOR_COND3_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.cond3.preheader:
+; CHECK-NEXT:    [[I_039:%.*]] = phi i16 [ [[INC22:%.*]], [[FOR_COND3_PREHEADER]] ], [ 0, [[FOR_COND3_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[INC22]] = add i16 [[I_039]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i16 [[INC22]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND3_PREHEADER]], label [[FOR_COND_CLEANUP_LOOPEXIT1:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup.loopexit1:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+; DONTWIDEN-LABEL: @test4(
+; DONTWIDEN-NEXT:  entry:
+; DONTWIDEN-NEXT:    [[CMP38:%.*]] = icmp sgt i16 [[N:%.*]], 0
+; DONTWIDEN-NEXT:    br i1 [[CMP38]], label [[FOR_COND3_PREHEADER_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DONTWIDEN:       for.cond3.preheader.lr.ph:
+; DONTWIDEN-NEXT:    [[CMP636:%.*]] = icmp sgt i16 [[M:%.*]], 0
+; DONTWIDEN-NEXT:    br i1 [[CMP636]], label [[FOR_COND3_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND3_PREHEADER_PREHEADER:%.*]]
+; DONTWIDEN:       for.cond3.preheader.preheader:
+; DONTWIDEN-NEXT:    br label [[FOR_COND3_PREHEADER:%.*]]
+; DONTWIDEN:       for.cond3.preheader.us.preheader:
+; DONTWIDEN-NEXT:    br label [[FOR_COND3_PREHEADER_US:%.*]]
+; DONTWIDEN:       for.cond3.preheader.us:
+; DONTWIDEN-NEXT:    [[I_039_US:%.*]] = phi i16 [ [[INC22_US:%.*]], [[FOR_COND3_FOR_COND_CLEANUP8_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND3_PREHEADER_US_PREHEADER]] ]
+; DONTWIDEN-NEXT:    [[MUL_US:%.*]] = mul i16 [[I_039_US]], [[M]]
+; DONTWIDEN-NEXT:    br label [[FOR_BODY9_US:%.*]]
+; DONTWIDEN:       for.body9.us:
+; DONTWIDEN-NEXT:    [[J_037_US:%.*]] = phi i16 [ 0, [[FOR_COND3_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY9_US]] ]
+; DONTWIDEN-NEXT:    [[ADD_US:%.*]] = add i16 [[J_037_US]], [[MUL_US]]
+; DONTWIDEN-NEXT:    [[CONV14_US:%.*]] = sext i16 [[ADD_US]] to i32
+; DONTWIDEN-NEXT:    [[CALL_US:%.*]] = tail call i32 @use_32(i32 [[CONV14_US]])
+; DONTWIDEN-NEXT:    [[CALL15_US:%.*]] = tail call i32 @use_16(i16 [[ADD_US]])
+; DONTWIDEN-NEXT:    [[CALL17_US:%.*]] = tail call i32 @use_32(i32 [[CONV14_US]])
+; DONTWIDEN-NEXT:    [[CALL18_US:%.*]] = tail call i32 @use_16(i16 [[ADD_US]])
+; DONTWIDEN-NEXT:    [[CONV19_US:%.*]] = sext i16 [[ADD_US]] to i64
+; DONTWIDEN-NEXT:    [[CALL20_US:%.*]] = tail call i32 @use_64(i64 [[CONV19_US]])
+; DONTWIDEN-NEXT:    [[INC_US]] = add nuw nsw i16 [[J_037_US]], 1
+; DONTWIDEN-NEXT:    [[CMP6_US:%.*]] = icmp slt i16 [[INC_US]], [[M]]
+; DONTWIDEN-NEXT:    br i1 [[CMP6_US]], label [[FOR_BODY9_US]], label [[FOR_COND3_FOR_COND_CLEANUP8_CRIT_EDGE_US]]
+; DONTWIDEN:       for.cond3.for.cond.cleanup8_crit_edge.us:
+; DONTWIDEN-NEXT:    [[INC22_US]] = add i16 [[I_039_US]], 1
+; DONTWIDEN-NEXT:    [[CMP_US:%.*]] = icmp slt i16 [[INC22_US]], [[N]]
+; DONTWIDEN-NEXT:    br i1 [[CMP_US]], label [[FOR_COND3_PREHEADER_US]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; DONTWIDEN:       for.cond3.preheader:
+; DONTWIDEN-NEXT:    [[I_039:%.*]] = phi i16 [ [[INC22:%.*]], [[FOR_COND3_PREHEADER]] ], [ 0, [[FOR_COND3_PREHEADER_PREHEADER]] ]
+; DONTWIDEN-NEXT:    [[INC22]] = add i16 [[I_039]], 1
+; DONTWIDEN-NEXT:    [[CMP:%.*]] = icmp slt i16 [[INC22]], [[N]]
+; DONTWIDEN-NEXT:    br i1 [[CMP]], label [[FOR_COND3_PREHEADER]], label [[FOR_COND_CLEANUP_LOOPEXIT1:%.*]]
+; DONTWIDEN:       for.cond.cleanup.loopexit:
+; DONTWIDEN-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DONTWIDEN:       for.cond.cleanup.loopexit1:
+; DONTWIDEN-NEXT:    br label [[FOR_COND_CLEANUP]]
+; DONTWIDEN:       for.cond.cleanup:
+; DONTWIDEN-NEXT:    ret void
+;
+entry:
+  %cmp38 = icmp sgt i16 %n, 0
+  br i1 %cmp38, label %for.cond3.preheader.lr.ph, label %for.cond.cleanup
+
+for.cond3.preheader.lr.ph:
+  %cmp636 = icmp sgt i16 %m, 0
+  br i1 %cmp636, label %for.cond3.preheader.us.preheader, label %for.cond3.preheader.preheader
+
+for.cond3.preheader.preheader:
+  br label %for.cond3.preheader
+
+for.cond3.preheader.us.preheader:
+  br label %for.cond3.preheader.us
+
+for.cond3.preheader.us:
+  %i.039.us = phi i16 [ %inc22.us, %for.cond3.for.cond.cleanup8_crit_edge.us ], [ 0, %for.cond3.preheader.us.preheader ]
+  %mul.us = mul i16 %i.039.us, %m
+  br label %for.body9.us
+
+for.body9.us:
+  %j.037.us = phi i16 [ 0, %for.cond3.preheader.us ], [ %inc.us, %for.body9.us ]
+  %add.us = add i16 %j.037.us, %mul.us
+  %conv14.us = sext i16 %add.us to i32
+  %call.us = tail call i32 @use_32(i32 %conv14.us) #2
+  %call15.us = tail call i32 @use_16(i16 %add.us) #2
+  %call17.us = tail call i32 @use_32(i32 %conv14.us) #2
+  %call18.us = tail call i32 @use_16(i16 %add.us) #2
+  %conv19.us = sext i16 %add.us to i64
+  %call20.us = tail call i32 @use_64(i64 %conv19.us) #2
+  %inc.us = add nuw nsw i16 %j.037.us, 1
+  %cmp6.us = icmp slt i16 %inc.us, %m
+  br i1 %cmp6.us, label %for.body9.us, label %for.cond3.for.cond.cleanup8_crit_edge.us
+
+for.cond3.for.cond.cleanup8_crit_edge.us:
+  %inc22.us = add i16 %i.039.us, 1
+  %cmp.us = icmp slt i16 %inc22.us, %n
+  br i1 %cmp.us, label %for.cond3.preheader.us, label %for.cond.cleanup
+
+for.cond3.preheader:
+  %i.039 = phi i16 [ %inc22, %for.cond3.preheader ], [ 0, %for.cond3.preheader.preheader ]
+  %inc22 = add i16 %i.039, 1
+  %cmp = icmp slt i16 %inc22, %n
+  br i1 %cmp, label %for.cond3.preheader, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+declare dso_local i32 @use_32(i32)
+declare dso_local i32 @use_16(i16)
+declare dso_local i32 @use_64(i64)
+
 declare dso_local void @f(i32* %0) local_unnamed_addr #1