[Mlir-commits] [clang] [llvm] [mlir] [openmp] [LoopTiling][Clang][MLIR] Canonical Intra-tile Loops (PR #191114)

Wed Apr 15 05:27:43 PDT 2026

llvmbot wrote:




@llvm/pr-subscribers-mlir-llvm

Author: Amit Tiwari (amitamd7)

<details>
<summary>Changes</summary>

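This PR canonicalizes the intra-tile loops generated by loop tiling. Each tile loop now uses its own counter with rectangular bounds `[0, TileSize)` and computes the logical iteration number as `.tile.iv = .floor.iv + .tile.cnt`. Because the rectangular tile loop may overshoot on the remainder tile, the innermost body is guarded by a predicate that checks that each `.tile.iv` is within the original trip count; the predicate is omitted when every trip count and tile size is a compile-time constant and the tile size evenly divides the trip count.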

---

Patch is 672.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/191114.diff


23 Files Affected:

- (modified) clang/lib/Sema/SemaOpenMP.cpp (+121-46) 
- (modified) clang/test/OpenMP/interchange_codegen.cpp (+1734-2449) 
- (modified) clang/test/OpenMP/irbuilder_unroll_partial_factor_for.c (+34-33) 
- (modified) clang/test/OpenMP/irbuilder_unroll_partial_heuristic_constant_for.c (+44-43) 
- (modified) clang/test/OpenMP/irbuilder_unroll_partial_heuristic_runtime_for.c (+45-44) 
- (modified) clang/test/OpenMP/irbuilder_unroll_unroll_partial_factor.c (+33-32) 
- (modified) clang/test/OpenMP/irbuilder_unroll_unroll_partial_heuristic.c (+34-33) 
- (modified) clang/test/OpenMP/tile_codegen.cpp (+1099-1405) 
- (modified) clang/test/OpenMP/tile_codegen_for_dependent.cpp (+146-162) 
- (modified) clang/test/OpenMP/tile_codegen_tile_for.cpp (+193-224) 
- (modified) clang/test/OpenMP/tile_messages.cpp (+1-1) 
- (added) clang/test/OpenMP/tile_rect_codegen.cpp (+50) 
- (added) clang/test/OpenMP/tile_rect_codegen_ir.cpp (+84) 
- (modified) clang/test/OpenMP/unroll_codegen_tile_for.cpp (+190-214) 
- (modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+39-25) 
- (modified) llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp (+30-5) 
- (modified) mlir/test/Target/LLVMIR/openmp-cli-tile01.mlir (+10-8) 
- (modified) mlir/test/Target/LLVMIR/openmp-cli-tile02.mlir (+16-14) 
- (modified) mlir/test/Target/LLVMIR/openmp-cli-tile03.mlir (+63-57) 
- (modified) openmp/runtime/test/transform/tile/foreach.cpp (+36) 
- (modified) openmp/runtime/test/transform/tile/intfor.c (+39-39) 
- (modified) openmp/runtime/test/transform/tile/iterfor.cpp (+27) 
- (modified) openmp/runtime/test/transform/tile/parallel-wsloop-collapse-foreach.cpp (+108) 


``````````diff

diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index fada37ba45755..0aece2f027fe3 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -14957,8 +14957,10 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
   // Create iteration variables for the generated loops.
   SmallVector<VarDecl *, 4> FloorIndVars;
   SmallVector<VarDecl *, 4> TileIndVars;
+  SmallVector<VarDecl *, 4> TileCntVars;
   FloorIndVars.resize(NumLoops);
   TileIndVars.resize(NumLoops);
+  TileCntVars.resize(NumLoops);
   for (unsigned I = 0; I < NumLoops; ++I) {
     OMPLoopBasedDirective::HelperExprs &LoopHelper = LoopHelpers[I];
 
@@ -14978,27 +14980,101 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
       FloorIndVars[I] = FloorCntDecl;
     }
 
-    // Iteration variable for the tile (i.e. inner) loop.
+    // Logical iteration variable for the tile loop. Retains the meaning of
+    // the original logical iteration number (floor_iv + tile_cnt) so that
+    // LoopHelper.Updates can derive the original loop variable unchanged.
     {
-      std::string TileCntName =
+      std::string TileIVName =
           (Twine(".tile_") + llvm::utostr(I) + ".iv." + OrigVarName).str();
 
-      // Reuse the iteration variable created by checkOpenMPLoop. It is also
-      // used by the expressions to derive the original iteration variable's
-      // value from the logical iteration number.
-      auto *TileCntDecl = cast<VarDecl>(IterVarRef->getDecl());
-      TileCntDecl->setDeclName(
-          &SemaRef.PP.getIdentifierTable().get(TileCntName));
-      TileIndVars[I] = TileCntDecl;
+      auto *TileIVDecl = cast<VarDecl>(IterVarRef->getDecl());
+      TileIVDecl->setDeclName(&SemaRef.PP.getIdentifierTable().get(TileIVName));
+      TileIndVars[I] = TileIVDecl;
+    }
+
+    // Loop counter for the rectangular tile loop [0, TileSize).
+    {
+      std::string TileCntName =
+          (Twine(".tile.cnt.") + llvm::utostr(I) + ".iv." + OrigVarName).str();
+      VarDecl *TileCntDecl =
+          buildVarDecl(SemaRef, {}, CntTy, TileCntName, nullptr, OrigCntVar);
+      TileCntVars[I] = TileCntDecl;
     }
 
     addLoopPreInits(Context, LoopHelper, LoopStmts[I], OriginalInits[I],
                     PreInits);
+
+    // Declare the logical tile IV in PreInits so it is in scope for the
+    // entire loop nest (it will be assigned in each tile loop body).
+    Decl *TileIVDeclPtr = TileIndVars[I];
+    PreInits.push_back(new (Context) DeclStmt(
+        DeclGroupRef::Create(Context, &TileIVDeclPtr, 1), {}, {}));
   }
 
   // Once the original iteration values are set, append the innermost body.
   Stmt *Inner = Body;
 
+  // Build a combined validity predicate that guards the innermost body.
+  // For each tiled dimension, check that the logical iteration number
+  // (.tile.iv) is within the original trip count. This is required because the
+  // tile loop now has rectangular (constant) bounds and may overshoot on the
+  // remainder tile. The predicate is: .tile.iv.0 < N0 && .tile.iv.1 < N1 ...
+  //
+  // Optimization: if every dimension's trip count is a compile-time constant
+  // that is evenly divisible by the corresponding tile size (also a constant),
+  // then the remainder tile is empty and the predicate is trivially true.
+  {
+    bool PredicateNeeded = false;
+    for (unsigned I = 0; I < NumLoops; ++I) {
+      Expr *TSExpr = SizesClause->getSizesRefs()[I];
+      Expr *NExpr = LoopHelpers[I].NumIterations;
+      llvm::APSInt TileVal, TripVal;
+      bool TSConst =
+          !TSExpr->containsErrors() && TSExpr->isIntegerConstantExpr(Context);
+      bool NConst = NExpr->isIntegerConstantExpr(Context);
+      if (TSConst && NConst) {
+        Expr::EvalResult TSResult;
+        TSExpr->EvaluateAsInt(TSResult, Context);
+        TileVal = TSResult.Val.getInt();
+        Expr::EvalResult NResult;
+        NExpr->EvaluateAsInt(NResult, Context);
+        TripVal = NResult.Val.getInt();
+        if (TileVal.isStrictlyPositive() && (TripVal.srem(TileVal)).isZero())
+          continue;
+      }
+      PredicateNeeded = true;
+      break;
+    }
+
+    if (PredicateNeeded) {
+      Expr *CombinedPred = nullptr;
+      for (unsigned I = 0; I < NumLoops; ++I) {
+        auto *OrigCntVar = cast<DeclRefExpr>(LoopHelpers[I].Counters[0]);
+        QualType IVTy = LoopHelpers[I].NumIterations->getType();
+        Expr *TileIVRef = buildDeclRefExpr(SemaRef, TileIndVars[I], IVTy,
+                                           OrigCntVar->getExprLoc());
+        ExprResult DimPred =
+            SemaRef.BuildBinOp(CurScope, OrigCntVar->getExprLoc(), BO_LT,
+                               TileIVRef, LoopHelpers[I].NumIterations);
+        if (!DimPred.isUsable())
+          return StmtError();
+        if (CombinedPred) {
+          ExprResult Combined =
+              SemaRef.BuildBinOp(CurScope, OrigCntVar->getExprLoc(), BO_LAnd,
+                                 CombinedPred, DimPred.get());
+          if (!Combined.isUsable())
+            return StmtError();
+          CombinedPred = Combined.get();
+        } else {
+          CombinedPred = DimPred.get();
+        }
+      }
+      Inner = IfStmt::Create(
+          Context, SourceLocation(), IfStatementKind::Ordinary, nullptr,
+          nullptr, CombinedPred, SourceLocation(), SourceLocation(), Inner);
+    }
+  }
+
   auto MakeDimTileSize = [&SemaRef = this->SemaRef, &CopyTransformer, &Context,
                           SizesClause, CurScope](int I) -> Expr * {
     Expr *DimTileSizeExpr = SizesClause->getSizesRefs()[I];
@@ -15006,7 +15082,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
     if (DimTileSizeExpr->containsErrors())
       return nullptr;
 
-    if (isa<ConstantExpr>(DimTileSizeExpr))
+    if (DimTileSizeExpr->isIntegerConstantExpr(Context))
       return AssertSuccess(CopyTransformer.TransformExpr(DimTileSizeExpr));
 
     // When the tile size is not a constant but a variable, it is possible to
@@ -15042,6 +15118,9 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
   };
 
   // Create tile loops from the inside to the outside.
+  // Each tile loop uses .tile.cnt as its counter with rectangular bounds
+  // [0, TileSize), and computes .tile.iv = .floor.iv + .tile.cnt to set
+  // the logical iteration number for LoopHelper.Updates.
   for (int I = NumLoops - 1; I >= 0; --I) {
     OMPLoopBasedDirective::HelperExprs &LoopHelper = LoopHelpers[I];
     Expr *NumIterations = LoopHelper.NumIterations;
@@ -15052,70 +15131,65 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
     // Commonly used variables. One of the constraints of an AST is that every
     // node object must appear at most once, hence we define a lambda that
     // creates a new AST node at every use.
+    auto MakeTileCntRef = [&SemaRef = this->SemaRef, &TileCntVars, I, IVTy,
+                           OrigCntVar]() {
+      return buildDeclRefExpr(SemaRef, TileCntVars[I], IVTy,
+                              OrigCntVar->getExprLoc());
+    };
     auto MakeTileIVRef = [&SemaRef = this->SemaRef, &TileIndVars, I, IVTy,
                           OrigCntVar]() {
       return buildDeclRefExpr(SemaRef, TileIndVars[I], IVTy,
                               OrigCntVar->getExprLoc());
     };
 
-    // For init-statement: auto .tile.iv = .floor.iv
+    // For init-statement: auto .tile.cnt = 0
     SemaRef.AddInitializerToDecl(
-        TileIndVars[I],
-        SemaRef
-            .DefaultLvalueConversion(
-                makeFloorIVRef(SemaRef, FloorIndVars, I, IVTy, OrigCntVar))
-            .get(),
+        TileCntVars[I],
+        SemaRef.ActOnIntegerConstant(LoopHelper.Init->getExprLoc(), 0).get(),
         /*DirectInit=*/false);
-    Decl *CounterDecl = TileIndVars[I];
+    Decl *CounterDecl = TileCntVars[I];
     StmtResult InitStmt = new (Context)
         DeclStmt(DeclGroupRef::Create(Context, &CounterDecl, 1),
                  OrigCntVar->getBeginLoc(), OrigCntVar->getEndLoc());
     if (!InitStmt.isUsable())
       return StmtError();
 
-    // For cond-expression:
-    //   .tile.iv < min(.floor.iv + DimTileSize, NumIterations)
+    // For cond-expression: .tile.cnt < DimTileSize  (rectangular bound)
     Expr *DimTileSize = MakeDimTileSize(I);
     if (!DimTileSize)
       return StmtError();
-    ExprResult EndOfTile = SemaRef.BuildBinOp(
-        CurScope, LoopHelper.Cond->getExprLoc(), BO_Add,
-        makeFloorIVRef(SemaRef, FloorIndVars, I, IVTy, OrigCntVar),
-        DimTileSize);
-    if (!EndOfTile.isUsable())
-      return StmtError();
-    ExprResult IsPartialTile =
-        SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT,
-                           NumIterations, EndOfTile.get());
-    if (!IsPartialTile.isUsable())
-      return StmtError();
-    ExprResult MinTileAndIterSpace = SemaRef.ActOnConditionalOp(
-        LoopHelper.Cond->getBeginLoc(), LoopHelper.Cond->getEndLoc(),
-        IsPartialTile.get(), NumIterations, EndOfTile.get());
-    if (!MinTileAndIterSpace.isUsable())
-      return StmtError();
     ExprResult CondExpr =
         SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT,
-                           MakeTileIVRef(), MinTileAndIterSpace.get());
+                           MakeTileCntRef(), DimTileSize);
     if (!CondExpr.isUsable())
       return StmtError();
 
-    // For incr-statement: ++.tile.iv
+    // For incr-statement: ++.tile.cnt
     ExprResult IncrStmt = SemaRef.BuildUnaryOp(
-        CurScope, LoopHelper.Inc->getExprLoc(), UO_PreInc, MakeTileIVRef());
+        CurScope, LoopHelper.Inc->getExprLoc(), UO_PreInc, MakeTileCntRef());
     if (!IncrStmt.isUsable())
       return StmtError();
 
-    // Statements to set the original iteration variable's value from the
-    // logical iteration number.
+    // Compute the logical iteration number:
+    //   .tile.iv = .floor.iv + .tile.cnt
+    ExprResult FloorPlusCnt = SemaRef.BuildBinOp(
+        CurScope, OrigCntVar->getExprLoc(), BO_Add,
+        makeFloorIVRef(SemaRef, FloorIndVars, I, IVTy, OrigCntVar),
+        MakeTileCntRef());
+    if (!FloorPlusCnt.isUsable())
+      return StmtError();
+    ExprResult TileIVAssign =
+        SemaRef.BuildBinOp(CurScope, OrigCntVar->getExprLoc(), BO_Assign,
+                           MakeTileIVRef(), FloorPlusCnt.get());
+    if (!TileIVAssign.isUsable())
+      return StmtError();
+
     // Generated for loop is:
     // \code
-    // Original_for_init;
-    // for (auto .tile.iv = .floor.iv;
-    //      .tile.iv < min(.floor.iv + DimTileSize, NumIterations);
-    //      ++.tile.iv) {
-    //   Original_Body;
-    //   Original_counter_update;
+    // for (auto .tile.cnt = 0; .tile.cnt < DimTileSize; ++.tile.cnt) {
+    //   .tile.iv = .floor.iv + .tile.cnt;
+    //   Original_counter_update;  // derives orig var from .tile.iv
+    //   Inner;                    // predicated body or inner tile loops
     // }
     // \endcode
     // FIXME: If the innermost body is an loop itself, inserting these
@@ -15123,6 +15197,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
     // for applying tiling again). If this is the case, sink the expressions
     // further into the inner loop.
     SmallVector<Stmt *, 4> BodyParts;
+    BodyParts.push_back(TileIVAssign.get());
     BodyParts.append(LoopHelper.Updates.begin(), LoopHelper.Updates.end());
     if (auto *SourceCXXFor = dyn_cast<CXXForRangeStmt>(LoopStmt))
       BodyParts.push_back(SourceCXXFor->getLoopVarStmt());
diff --git a/clang/test/OpenMP/interchange_codegen.cpp b/clang/test/OpenMP/interchange_codegen.cpp
index 8e833c9df324c..b062d42c9f162 100644
--- a/clang/test/OpenMP/interchange_codegen.cpp
+++ b/clang/test/OpenMP/interchange_codegen.cpp
@@ -123,6 +123,7 @@ extern "C" void foo10() {
 
 #endif /* HEADER */
 
+
 // CHECK1-LABEL: define {{[^@]+}}@body
 // CHECK1-SAME: (...) #[[ATTR0:[0-9]+]] {
 // CHECK1-NEXT:  entry:
@@ -156,7 +157,7 @@ extern "C" void foo10() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[I]], align 4
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP4]]
 // CHECK1-NEXT:    store i32 [[ADD]], ptr [[I]], align 4
-// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]]
 // CHECK1:       for.end:
 // CHECK1-NEXT:    ret void
 //
@@ -262,14 +263,14 @@ extern "C" void foo10() {
 // CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
 // CHECK1-NEXT:    [[INC:%.*]] = add i32 [[TMP28]], 1
 // CHECK1-NEXT:    store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4
-// CHECK1-NEXT:    br label [[FOR_COND16]], !llvm.loop [[LOOP5:![0-9]+]]
+// CHECK1-NEXT:    br label [[FOR_COND16]], !llvm.loop [[LOOP4:![0-9]+]]
 // CHECK1:       for.end:
 // CHECK1-NEXT:    br label [[FOR_INC22:%.*]]
 // CHECK1:       for.inc22:
 // CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4
 // CHECK1-NEXT:    [[INC23:%.*]] = add i32 [[TMP29]], 1
 // CHECK1-NEXT:    store i32 [[INC23]], ptr [[DOTPERMUTED_0_IV_J]], align 4
-// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
+// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
 // CHECK1:       for.end24:
 // CHECK1-NEXT:    ret void
 //
@@ -342,7 +343,7 @@ extern "C" void foo10() {
 // CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
 // CHECK1-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP12]], 1
 // CHECK1-NEXT:    store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4
-// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
 // CHECK1:       for.end:
 // CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
 // CHECK1:       omp.body.continue:
@@ -439,7 +440,7 @@ extern "C" void foo10() {
 // CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_I]], align 4
 // CHECK1-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP14]], 1
 // CHECK1-NEXT:    store i32 [[INC]], ptr [[DOTPERMUTED_1_IV_I]], align 4
-// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
 // CHECK1:       for.end:
 // CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
 // CHECK1:       omp.body.continue:
@@ -754,28 +755,28 @@ extern "C" void foo10() {
 // CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTPERMUTED_3_IV_I]], align 4
 // CHECK1-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP12]], 1
 // CHECK1-NEXT:    store i32 [[INC]], ptr [[DOTPERMUTED_3_IV_I]], align 4
-// CHECK1-NEXT:    br label [[FOR_COND11]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK1-NEXT:    br label [[FOR_COND11]], !llvm.loop [[LOOP8:![0-9]+]]
 // CHECK1:       for.end:
 // CHECK1-NEXT:    br label [[FOR_INC16:%.*]]
 // CHECK1:       for.inc16:
 // CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTPERMUTED_2_IV_L]], align 4
 // CHECK1-NEXT:    [[INC17:%.*]] = add nsw i32 [[TMP13]], 1
 // CHECK1-NEXT:    store i32 [[INC17]], ptr [[DOTPERMUTED_2_IV_L]], align 4
-// CHECK1-NEXT:    br label [[FOR_COND6]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK1-NEXT:    br label [[FOR_COND6]], !llvm.loop [[LOOP9:![0-9]+]]
 // CHECK1:       for.end18:
 // CHECK1-NEXT:    br label [[FOR_INC19:%.*]]
 // CHECK1:       for.inc19:
 // CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTPERMUTED_1_IV_K]], align 4
 // CHECK1-NEXT:    [[INC20:%.*]] = add nsw i32 [[TMP14]], 1
 // CHECK1-NEXT:    store i32 [[INC20]], ptr [[DOTPERMUTED_1_IV_K]], align 4
-// CHECK1-NEXT:    br label [[FOR_COND1]], !llvm.loop [[LOOP11:![0-9]+]]
+// CHECK1-NEXT:    br label [[FOR_COND1]], !llvm.loop [[LOOP10:![0-9]+]]
 // CHECK1:       for.end21:
 // CHECK1-NEXT:    br label [[FOR_INC22:%.*]]
 // CHECK1:       for.inc22:
 // CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTPERMUTED_0_IV_J]], align 4
 // CHECK1-NEXT:    [[INC23:%.*]] = add nsw i32 [[TMP15]], 1
 // CHECK1-NEXT:    store i32 [[INC23]], ptr [[DOTPERMUTED_0_IV_J]], align 4
-// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]]
+// CHECK1-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]]
 // CHECK1:       for.end24:
 // CHECK1-NEXT:    ret void
 //
@@ -810,22 +811,21 @@ extern "C" void foo10() {
 // CHECK1-NEXT:    [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTNEW_STEP10:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTCAPTURE_EXPR_11:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTTILE_0_IV_K:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTCAPTURE_EXPR_14:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_16:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_17:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_22:%.*]] = alloca i64, align 8
+// CHECK1-NEXT:    [[DOTCAPTURE_EXPR_16:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    [[I:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[J:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTFLOOR_0_IV_K:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    [[DOTTILE_0_IV_K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTTILE_CNT_0_IV_K:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTOMP_LB:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    [[DOTOMP_UB:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
 // CHECK1-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    [[I49:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    [[J50:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    [[DOTFLOOR_0_IV_K51:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    [[DOTTILE_0_IV_K52:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[I35:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[J36:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTFLOOR_0_IV_K37:%.*]] = alloca i32, align 4
+// CHECK1-NEXT:    [[DOTTILE_CNT_0_IV_K38:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
 // CHECK1-NEXT:    store i32 [[START]], ptr [[START_ADDR]], align 4
 // CHECK1-NEXT:    store i32 [[END]], ptr [[END_ADDR]], align 4
@@ -863,630 +863,452 @@ extern "C" void foo10() {
 // CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4
 // CHECK1-NEXT:    [[ADD15:%.*]] = add i32 [[TMP15]], 1
 // CHECK1-NEXT:    store i32 [[ADD15]], ptr [[DOTCAPTURE_EXPR_14]], align 4
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[_TMP2]], align 4
-// CHECK1-NEXT:    store i32 [[TMP16]], ptr [[DOTCAPTURE_EXPR_16]], align 4
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4
-// CHECK1-NEXT:    [[ADD18:%.*]] = add i32 [[TMP17]], 1
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[_TMP2]], align 4
-// CHECK1-NEXT:    [[ADD19:%.*]] = add i32 [[TMP18]], 32
-// CHECK1-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD18]], [[ADD19]]
-// CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
-// CHECK1:       cond.true:
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_11]], align 4
-// CHECK1-NEXT:    [[ADD20:%.*]] = add i32 [[TMP19]], 1
-// CHECK1-NEXT:    br label [[COND_END:%.*]]
-// CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[_TMP2]], align 4
-// CHECK1-NEXT:    [[ADD21:%.*]] = add i32 [[TMP20]], 32
-// CHECK1-NEXT:    br label [[COND_END]]
-// CHECK1:       cond.end:
-// CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ [[ADD20]], [[COND_TRUE]] ], [ [[ADD21]], [[COND_FALSE]] ]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTCAPTURE_EXPR_17]], align 4
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT:    [[SUB23:%.*]] = sub i32 [[TMP21]], [[TMP22]]
-// CHECK1-NEXT:    [[SUB24:%.*]] = sub i32 [[SUB23]], 1
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
-// CHECK1-NEXT:    [[ADD25:%.*]] = add i32 [[SUB24]], [[TMP23]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
-// CHECK1-NEXT:    [[DIV26:%.*]] = udiv i32 [[ADD25]], [[TMP24]]
-// CHECK1-NEXT:    [[CONV:%.*]] = zext i32 [[DIV26]] to i64
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i32,...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/191114