[llvm-branch-commits] [llvm] 0ea3749 - [LV] Set up branch from middle block earlier.

Sun Dec 27 10:26:35 PST 2020

Author: Florian Hahn
Date: 2020-12-27T18:21:12Z
New Revision: 0ea3749b3cde16d70c5f66357b623c8edf521f2b

URL: https://github.com/llvm/llvm-project/commit/0ea3749b3cde16d70c5f66357b623c8edf521f2b
DIFF: https://github.com/llvm/llvm-project/commit/0ea3749b3cde16d70c5f66357b623c8edf521f2b.diff

LOG: [LV] Set up branch from middle block earlier.

Previously, the branch from the middle block to the scalar preheader and
exit block was set up at the end of skeleton creation, in
completeLoopSkeleton. Inserting SCEV or runtime checks may result in LCSSA
PHIs being created, if they are required. Adjusting branches afterwards may
break those PHIs.

To avoid this, we can instead create the branch from the middle block
to the exit right after the middle block is created, so we have the final
CFG before potentially adjusting/creating PHIs.

This fixes a crash for the included test case. For the non-crashing
case, this is almost an NFC (no functional change) with respect to the
generated code. The only change is the order of the predecessors of the
involved branch targets.

Note that an assertion was moved from the LoopVersioning constructor to
LoopVersioning::versionLoop. Adjusting the branches means loop-simplify
form may be broken before LoopVersioning is constructed. But the loop
vectorizer (LV) only uses LoopVersioning to annotate the loop instructions
with !noalias metadata, which does not require loop-simplify form.

This is a fix for an existing issue uncovered by D93317.

Added: 
    llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll

Modified: 
    llvm/lib/Transforms/Utils/LoopVersioning.cpp
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
    llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index 06e204060cb0..b54aee35d56d 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -45,11 +45,13 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI,
       Preds(LAI.getPSE().getUnionPredicate()), LAI(LAI), LI(LI), DT(DT),
       SE(SE) {
   assert(L->getExitBlock() && "No single exit block");
-  assert(L->isLoopSimplifyForm() && "Loop is not in loop-simplify form");
 }
 
 void LoopVersioning::versionLoop(
     const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
+  assert(VersionedLoop->isLoopSimplifyForm() &&
+         "Loop is not in loop-simplify form");
+
   Instruction *FirstCheckInst;
   Instruction *MemRuntimeCheck;
   Value *SCEVRuntimeCheck;

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6ab8e5884a76..5889d5e55339 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3311,6 +3311,16 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopScalarPreHeader =
       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                  nullptr, Twine(Prefix) + "scalar.ph");
+
+  // Set up branch from middle block to the exit and scalar preheader blocks.
+  // completeLoopSkeleton will update the condition to use an iteration check,
+  // if required to decide whether to execute the remainder.
+  BranchInst *BrInst =
+      BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
+  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
+  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
+  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
+
   // We intentionally don't let SplitBlock to update LoopInfo since
   // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
   // LoopVectorBody is explicitly added to the correct place few lines later.
@@ -3419,24 +3429,19 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
   // If tail is to be folded, we know we don't need to run the remainder.
-  Value *CmpN = Builder.getTrue();
   if (!Cost->foldTailByMasking()) {
-    CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                           VectorTripCount, "cmp.n",
-                           LoopMiddleBlock->getTerminator());
+    Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                                        Count, VectorTripCount, "cmp.n",
+                                        LoopMiddleBlock->getTerminator());
 
     // Here we use the same DebugLoc as the scalar loop latch terminator instead
     // of the corresponding compare because they may have ended up with
     // different line numbers and we want to avoid awkward line stepping while
     // debugging. Eg. if the compare has got a line number inside the loop.
-    cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchTerm->getDebugLoc());
+    CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
+    cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
   }
 
-  BranchInst *BrInst =
-      BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
-  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
-  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
-
   // Get ready to start creating new instructions into the vectorized body.
   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
          "Inconsistent vector loop preheader");

diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
index 4b312c8b94fe..4e14c29ddc9f 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
@@ -61,13 +61,13 @@ define void @func_21() {
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 6, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
@@ -79,7 +79,7 @@ define void @func_21() {
 ; CHECK-NEXT:    store i32 [[SCALAR_RECUR]], i32* [[B_PTR]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 5
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop !2
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;

diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 5a250984b1f0..f0075d91f22b 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -22,7 +22,7 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 ; CHECK:       middle.block:
 ; CHECK:         %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
 ; CHECK:       scalar.ph:
-; CHECK:         %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %for.preheader ]
+; CHECK:         %scalar.recur.init = phi i32 [ %pre_load, %vector.memcheck ], [ %pre_load, %for.preheader ], [ %vector.recur.extract, %middle.block ]
 ; CHECK:       scalar.body:
 ; CHECK:         %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
 ;
@@ -79,7 +79,7 @@ for.exit:
 ; CHECK:       middle.block:
 ; CHECK:         %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
 ; CHECK:       scalar.ph:
-; CHECK:         %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %for.preheader ]
+; CHECK:         %scalar.recur.init = phi i32 [ %.pre, %for.preheader ], [ %vector.recur.extract, %middle.block ]
 ; CHECK:       scalar.body:
 ; CHECK:         %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
 ;
@@ -147,7 +147,7 @@ scalar.body:
 ; CHECK:       middle.block:
 ; CHECK:         %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3
 ; CHECK:       scalar.ph:
-; CHECK:         %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %for.preheader ]
+; CHECK:         %scalar.recur.init = phi i16 [ %0, %vector.memcheck ], [ %0, %for.preheader ], [ %vector.recur.extract, %middle.block ]
 ; CHECK:       scalar.body:
 ; CHECK:         %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
 ;

diff --git a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll
new file mode 100644
index 000000000000..3b0ba312fde8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
+
+; Make sure LV does not crash when creating runtime checks involving values from
+; other loops.
+define i16 @test(i16** %arg, i64 %N) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER:%.*]]
+; CHECK:       outer:
+; CHECK-NEXT:    [[L_1:%.*]] = load i16*, i16** [[ARG:%.*]], align 8
+; CHECK-NEXT:    [[L_2:%.*]] = load i16*, i16** [[ARG]], align 8
+; CHECK-NEXT:    [[C_1:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[C_1]], label [[OUTER_BACKEDGE:%.*]], label [[INNER_PREHEADER:%.*]]
+; CHECK:       outer.backedge:
+; CHECK-NEXT:    br label [[OUTER]]
+; CHECK:       inner.preheader:
+; CHECK-NEXT:    br label [[INNER:%.*]]
+; CHECK:       inner:
+; CHECK-NEXT:    [[C_2:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[C_2]], label [[OUTER_LATCH:%.*]], label [[INNER_BB:%.*]]
+; CHECK:       inner.bb:
+; CHECK-NEXT:    [[C_3:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[C_3]], label [[LOOP3_PREHEADER:%.*]], label [[INNER_LATCH:%.*]]
+; CHECK:       loop3.preheader:
+; CHECK-NEXT:    [[L_1_LCSSA11:%.*]] = phi i16* [ [[L_1]], [[INNER_BB]] ]
+; CHECK-NEXT:    [[L_1_LCSSA:%.*]] = phi i16* [ [[L_1]], [[INNER_BB]] ]
+; CHECK-NEXT:    [[L_2_LCSSA:%.*]] = phi i16* [ [[L_2]], [[INNER_BB]] ]
+; CHECK-NEXT:    [[L_2_LCSSA4:%.*]] = bitcast i16* [[L_2_LCSSA]] to i8*
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[L_2_LCSSA4]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i16, i16* [[L_1_LCSSA]], i64 1
+; CHECK-NEXT:    [[SCEVGEP9:%.*]] = bitcast i16* [[SCEVGEP]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[N]], 2
+; CHECK-NEXT:    [[SCEVGEP10:%.*]] = getelementptr i16, i16* [[L_1_LCSSA11]], i64 [[TMP1]]
+; CHECK-NEXT:    [[SCEVGEP1013:%.*]] = bitcast i16* [[SCEVGEP10]] to i8*
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[L_2_LCSSA4]], [[SCEVGEP1013]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP9]], [[UGLYGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true
+; CHECK-NEXT:    br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[L_1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[TMP5]] to <2 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP6]], align 2, !alias.scope !0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[L_2]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    store i16 [[TMP8]], i16* [[TMP7]], align 2, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    store i16 [[TMP9]], i16* [[TMP7]], align 2, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[L_18:%.*]] = phi i16* [ [[L_1_LCSSA]], [[VECTOR_MEMCHECK]] ], [ [[L_1_LCSSA]], [[LOOP3_PREHEADER]] ], [ [[L_1_LCSSA]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[L_23:%.*]] = phi i16* [ [[L_2_LCSSA]], [[VECTOR_MEMCHECK]] ], [ [[L_2_LCSSA]], [[LOOP3_PREHEADER]] ], [ [[L_2_LCSSA]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP3_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[LOOP3:%.*]]
+; CHECK:       inner.latch:
+; CHECK-NEXT:    [[C_4:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[C_4]], label [[EXIT_LOOPEXIT1:%.*]], label [[INNER]]
+; CHECK:       outer.latch:
+; CHECK-NEXT:    br label [[OUTER_BACKEDGE]]
+; CHECK:       loop3:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP3]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[C_5:%.*]] = icmp ult i64 [[IV]], [[N]]
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i16, i16* [[L_18]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[LOOP_L_1:%.*]] = load i16, i16* [[GEP_1]], align 2
+; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i16, i16* [[L_23]], i64 0
+; CHECK-NEXT:    store i16 [[LOOP_L_1]], i16* [[GEP_2]], align 2
+; CHECK-NEXT:    br i1 [[C_5]], label [[LOOP3]], label [[EXIT_LOOPEXIT]], [[LOOP7:!llvm.loop !.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    [[L_17:%.*]] = phi i16* [ [[L_1_LCSSA]], [[MIDDLE_BLOCK]] ], [ [[L_18]], [[LOOP3]] ]
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit.loopexit1:
+; CHECK-NEXT:    [[L_1_LCSSA5:%.*]] = phi i16* [ [[L_1]], [[INNER_LATCH]] ]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[L_16:%.*]] = phi i16* [ [[L_1_LCSSA5]], [[EXIT_LOOPEXIT1]] ], [ [[L_17]], [[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    [[L_3:%.*]] = load i16, i16* [[L_16]], align 2
+; CHECK-NEXT:    ret i16 [[L_3]]
+;
+entry:
+  br label %outer
+
+outer:
+  %l.1 = load i16*, i16** %arg, align 8
+  %l.2 = load i16*, i16** %arg, align 8
+  %c.1 = call i1 @cond()
+  br i1 %c.1, label %outer, label %inner
+
+inner:                                              ; preds = %inner.latch, %outer
+  %c.2 = call i1 @cond()
+  br i1 %c.2, label %outer.latch, label %inner.bb
+
+inner.bb:                                              ; preds = %inner
+  %c.3 = call i1 @cond()
+  br i1 %c.3, label %loop3, label %inner.latch
+
+inner.latch:                                             ; preds = %inner.bb
+  %c.4 = call i1 @cond()
+  br i1 %c.4, label %exit, label %inner
+
+outer.latch:                                             ; preds = %inner
+  br label %outer
+
+loop3:                                              ; preds = %loop3, %inner.bb
+  %iv = phi i64 [ %iv.next, %loop3 ], [ 0, %inner.bb ]
+  %iv.next = add nsw nuw i64 %iv, 1
+  %c.5  = icmp ult i64 %iv, %N
+  %gep.1 = getelementptr inbounds i16, i16* %l.1, i64 %iv.next
+  %loop.l.1 = load i16, i16* %gep.1, align 2
+  %gep.2 = getelementptr inbounds i16, i16* %l.2, i64 0
+  store i16 %loop.l.1, i16* %gep.2 , align 2
+  br i1 %c.5, label %loop3, label %exit
+
+exit:                                             ; preds = %loop3, %inner.latch
+  %l.3 = load i16, i16* %l.1, align 2
+  ret i16 %l.3
+}
+
+declare i1 @cond()