[llvm] [X86] Reduce znver3/4 LoopMicroOpBufferSize to practical loop unrolling values (PR #91340)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu May 16 02:45:47 PDT 2024


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/91340

>From e882246c6095962a9776200ec8d09189aab0f8e7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 7 May 2024 15:29:29 +0100
Subject: [PATCH] [X86] Reduce znver3/4 LoopMicroOpBufferSize to practical
 values

The znver3/4 scheduler models have previously associated the LoopMicroOpBufferSize with the maximum size of their op caches, and when this led to quadratic complexity issues it was reduced to a value of 512 uops, based mainly on compilation time and not on its effect on runtime performance.

From a runtime performance POV, a large LoopMicroOpBufferSize leads to a higher number of loop unrolls, meaning the CPU has to rely on the frontend decode rate (4 ins/cy max) for much longer to fill the op cache before looping begins and we make use of the faster op cache rate (8/9 ops/cy).

This patch proposes we instead cap the size of the LoopMicroOpBufferSize based off the maximum rate from the op cache (znver3 = 8op/cy, znver4 = 9op/cy) and the branch misprediction penalty from the opcache (~12cy) as an estimate of the useful number of ops we can unroll a loop by before mispredictions are likely to cause stalls. This isn't a perfect metric, but does try to be closer to the spirit of how we use LoopMicroOpBufferSize in the compiler vs the size of a similarly named buffer in the cpu.
---
 llvm/lib/Target/X86/X86ScheduleZnver3.td      |  11 +-
 llvm/lib/Target/X86/X86ScheduleZnver4.td      |  16 +-
 llvm/test/Transforms/LoopUnroll/X86/znver3.ll | 280 +++---------------
 3 files changed, 52 insertions(+), 255 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td
index 2e87d5262818c..cbf1de8408798 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver3.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -33,13 +33,10 @@ def Znver3Model : SchedMachineModel {
   // The op cache is organized as an associative cache with 64 sets and 8 ways.
   // At each set-way intersection is an entry containing up to 8 macro ops.
   // The maximum capacity of the op cache is 4K ops.
-  // Agner, 22.5 µop cache
-  // The size of the µop cache is big enough for holding most critical loops.
-  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadradic complexity,
-  //        with large values here the compilation of certain loops
-  //        ends up taking way too long.
-  // let LoopMicroOpBufferSize = 4096;
-  let LoopMicroOpBufferSize = 512;
+  // Assuming a maximum dispatch of 8 ops/cy and a mispredict cost of 12cy from
+  // the op-cache, we limit the loop buffer to 8*12 = 96 to avoid loop unrolling
+  // leading to excessive filling of the op-cache from frontend.
+  let LoopMicroOpBufferSize = 96;
   // AMD SOG 19h, 2.6.2 L1 Data Cache
   // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
   // AMD SOG 19h, 2.12 L1 Data Cache
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index dac4d8422582a..7107dbc63e279 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -28,17 +28,11 @@ def Znver4Model : SchedMachineModel {
   // AMD SOG 19h, 2.9.1 Op Cache
   // The op cache is organized as an associative cache with 64 sets and 8 ways.
   // At each set-way intersection is an entry containing up to 8 macro ops.
-  // The maximum capacity of the op cache is 4K ops.
-  // Agner, 22.5 µop cache
-  // The size of the µop cache is big enough for holding most critical loops.
-  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadradic complexity,
-  //        with large values here the compilation of certain loops
-  //        ends up taking way too long.
-  // Ideally for znver4, we should have 6.75K. However we don't add that
-  // considerting the impact compile time and prefer using default values 
-  // instead.
-  // Retaining minimal value to influence unrolling as we did for znver3.
-  let LoopMicroOpBufferSize = 512;
+  // The maximum capacity of the op cache is 6.75K ops.
+  // Assuming a maximum dispatch of 9 ops/cy and a mispredict cost of 12cy from
+  // the op-cache, we limit the loop buffer to 9*12 = 108 to avoid loop
+  // unrolling leading to excessive filling of the op-cache from frontend.
+  let LoopMicroOpBufferSize = 108;
   // AMD SOG 19h, 2.6.2 L1 Data Cache
   // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
   // AMD SOG 19h, 2.12 L1 Data Cache
diff --git a/llvm/test/Transforms/LoopUnroll/X86/znver3.ll b/llvm/test/Transforms/LoopUnroll/X86/znver3.ll
index 467c57906d888..b1f1d7d814e6c 100644
--- a/llvm/test/Transforms/LoopUnroll/X86/znver3.ll
+++ b/llvm/test/Transforms/LoopUnroll/X86/znver3.ll
@@ -9,8 +9,8 @@ define i32 @test(ptr %ary) "target-cpu"="znver3" {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_31:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_NEXT_31:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_127:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_NEXT_127:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[SUM_NEXT:%.*]] = add nsw i32 [[VAL]], [[SUM]]
@@ -73,76 +73,12 @@ define i32 @test(ptr %ary) "target-cpu"="znver3" {
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_14:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 15
 ; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_14]]
 ; CHECK-NEXT:    [[VAL_15:%.*]] = load i32, ptr [[ARRAYIDX_15]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_15:%.*]] = add nsw i32 [[VAL_15]], [[SUM_NEXT_14]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_15:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 16
-; CHECK-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_15]]
-; CHECK-NEXT:    [[VAL_16:%.*]] = load i32, ptr [[ARRAYIDX_16]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_16:%.*]] = add nsw i32 [[VAL_16]], [[SUM_NEXT_15]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_16:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 17
-; CHECK-NEXT:    [[ARRAYIDX_17:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_16]]
-; CHECK-NEXT:    [[VAL_17:%.*]] = load i32, ptr [[ARRAYIDX_17]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_17:%.*]] = add nsw i32 [[VAL_17]], [[SUM_NEXT_16]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_17:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 18
-; CHECK-NEXT:    [[ARRAYIDX_18:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_17]]
-; CHECK-NEXT:    [[VAL_18:%.*]] = load i32, ptr [[ARRAYIDX_18]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_18:%.*]] = add nsw i32 [[VAL_18]], [[SUM_NEXT_17]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_18:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 19
-; CHECK-NEXT:    [[ARRAYIDX_19:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_18]]
-; CHECK-NEXT:    [[VAL_19:%.*]] = load i32, ptr [[ARRAYIDX_19]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_19:%.*]] = add nsw i32 [[VAL_19]], [[SUM_NEXT_18]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_19:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 20
-; CHECK-NEXT:    [[ARRAYIDX_20:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_19]]
-; CHECK-NEXT:    [[VAL_20:%.*]] = load i32, ptr [[ARRAYIDX_20]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_20:%.*]] = add nsw i32 [[VAL_20]], [[SUM_NEXT_19]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_20:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 21
-; CHECK-NEXT:    [[ARRAYIDX_21:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_20]]
-; CHECK-NEXT:    [[VAL_21:%.*]] = load i32, ptr [[ARRAYIDX_21]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_21:%.*]] = add nsw i32 [[VAL_21]], [[SUM_NEXT_20]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_21:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 22
-; CHECK-NEXT:    [[ARRAYIDX_22:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_21]]
-; CHECK-NEXT:    [[VAL_22:%.*]] = load i32, ptr [[ARRAYIDX_22]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_22:%.*]] = add nsw i32 [[VAL_22]], [[SUM_NEXT_21]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_22:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 23
-; CHECK-NEXT:    [[ARRAYIDX_23:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_22]]
-; CHECK-NEXT:    [[VAL_23:%.*]] = load i32, ptr [[ARRAYIDX_23]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_23:%.*]] = add nsw i32 [[VAL_23]], [[SUM_NEXT_22]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_23:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 24
-; CHECK-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_23]]
-; CHECK-NEXT:    [[VAL_24:%.*]] = load i32, ptr [[ARRAYIDX_24]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_24:%.*]] = add nsw i32 [[VAL_24]], [[SUM_NEXT_23]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_24:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 25
-; CHECK-NEXT:    [[ARRAYIDX_25:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_24]]
-; CHECK-NEXT:    [[VAL_25:%.*]] = load i32, ptr [[ARRAYIDX_25]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_25:%.*]] = add nsw i32 [[VAL_25]], [[SUM_NEXT_24]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_25:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 26
-; CHECK-NEXT:    [[ARRAYIDX_26:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_25]]
-; CHECK-NEXT:    [[VAL_26:%.*]] = load i32, ptr [[ARRAYIDX_26]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_26:%.*]] = add nsw i32 [[VAL_26]], [[SUM_NEXT_25]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_26:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 27
-; CHECK-NEXT:    [[ARRAYIDX_27:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_26]]
-; CHECK-NEXT:    [[VAL_27:%.*]] = load i32, ptr [[ARRAYIDX_27]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_27:%.*]] = add nsw i32 [[VAL_27]], [[SUM_NEXT_26]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_27:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 28
-; CHECK-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_27]]
-; CHECK-NEXT:    [[VAL_28:%.*]] = load i32, ptr [[ARRAYIDX_28]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_28:%.*]] = add nsw i32 [[VAL_28]], [[SUM_NEXT_27]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_28:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 29
-; CHECK-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_28]]
-; CHECK-NEXT:    [[VAL_29:%.*]] = load i32, ptr [[ARRAYIDX_29]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_29:%.*]] = add nsw i32 [[VAL_29]], [[SUM_NEXT_28]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_29:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 30
-; CHECK-NEXT:    [[ARRAYIDX_30:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_29]]
-; CHECK-NEXT:    [[VAL_30:%.*]] = load i32, ptr [[ARRAYIDX_30]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_30:%.*]] = add nsw i32 [[VAL_30]], [[SUM_NEXT_29]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_30:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 31
-; CHECK-NEXT:    [[ARRAYIDX_31:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT_30]]
-; CHECK-NEXT:    [[VAL_31:%.*]] = load i32, ptr [[ARRAYIDX_31]], align 4
-; CHECK-NEXT:    [[SUM_NEXT_31]] = add nsw i32 [[VAL_31]], [[SUM_NEXT_30]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_31]] = add nuw nsw i64 [[INDVARS_IV]], 32
-; CHECK-NEXT:    [[EXITCOND_NOT_31:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_31]], 8192
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT_31]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    [[SUM_NEXT_127]] = add nsw i32 [[VAL_15]], [[SUM_NEXT_14]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_127]] = add nuw nsw i64 [[INDVARS_IV]], 16
+; CHECK-NEXT:    [[EXITCOND_NOT_127:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_127]], 8192
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_127]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT_31]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT_127]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_NEXT_LCSSA]]
 ;
 entry:
@@ -166,198 +102,68 @@ define i32 @test2(ptr %ary, i64 %n) "target-cpu"="znver3" {
 ; CHECK-LABEL: define i32 @test2(
 ; CHECK-SAME: ptr [[ARY:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
-; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
-; CHECK-NEXT:    br i1 [[TMP1]], label [[FOR_COND_CLEANUP_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]]
-; CHECK:       entry.new:
-; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY_NEW]] ], [ [[SUM_NEXT_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[DUMMY1:%.*]] = mul i32 [[VAL]], [[VAL]]
-; CHECK-NEXT:    [[DUMMY2:%.*]] = mul i32 [[DUMMY1]], [[DUMMY1]]
-; CHECK-NEXT:    [[DUMMY3:%.*]] = mul i32 [[DUMMY2]], [[DUMMY2]]
-; CHECK-NEXT:    [[DUMMY4:%.*]] = mul i32 [[DUMMY3]], [[DUMMY3]]
+; CHECK-NEXT:    [[INDVARS_IV_EPIL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_EPIL:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUM_EPIL:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_NEXT_EPIL:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_EPIL]]
+; CHECK-NEXT:    [[VAL_EPIL:%.*]] = load i32, ptr [[ARRAYIDX_EPIL]], align 4
+; CHECK-NEXT:    [[DUMMY21:%.*]] = mul i32 [[VAL_EPIL]], [[VAL_EPIL]]
+; CHECK-NEXT:    [[DUMMY27:%.*]] = mul i32 [[DUMMY21]], [[DUMMY21]]
+; CHECK-NEXT:    [[DUMMY28:%.*]] = mul i32 [[DUMMY27]], [[DUMMY27]]
+; CHECK-NEXT:    [[DUMMY4:%.*]] = mul i32 [[DUMMY28]], [[DUMMY28]]
 ; CHECK-NEXT:    [[DUMMY5:%.*]] = mul i32 [[DUMMY4]], [[DUMMY4]]
 ; CHECK-NEXT:    [[DUMMY6:%.*]] = mul i32 [[DUMMY5]], [[DUMMY5]]
 ; CHECK-NEXT:    [[DUMMY7:%.*]] = mul i32 [[DUMMY6]], [[DUMMY6]]
 ; CHECK-NEXT:    [[DUMMY8:%.*]] = mul i32 [[DUMMY7]], [[DUMMY7]]
 ; CHECK-NEXT:    [[DUMMY9:%.*]] = mul i32 [[DUMMY8]], [[DUMMY8]]
 ; CHECK-NEXT:    [[DUMMY10:%.*]] = mul i32 [[DUMMY9]], [[DUMMY9]]
-; CHECK-NEXT:    [[DUMMY11:%.*]] = mul i32 [[DUMMY10]], [[DUMMY10]]
-; CHECK-NEXT:    [[DUMMY12:%.*]] = mul i32 [[DUMMY11]], [[DUMMY11]]
-; CHECK-NEXT:    [[DUMMY13:%.*]] = mul i32 [[DUMMY12]], [[DUMMY12]]
-; CHECK-NEXT:    [[DUMMY14:%.*]] = mul i32 [[DUMMY13]], [[DUMMY13]]
-; CHECK-NEXT:    [[DUMMY15:%.*]] = mul i32 [[DUMMY14]], [[DUMMY14]]
+; CHECK-NEXT:    [[DUMMY29:%.*]] = mul i32 [[DUMMY10]], [[DUMMY10]]
+; CHECK-NEXT:    [[DUMMY30:%.*]] = mul i32 [[DUMMY29]], [[DUMMY29]]
+; CHECK-NEXT:    [[DUMMY39:%.*]] = mul i32 [[DUMMY30]], [[DUMMY30]]
+; CHECK-NEXT:    [[DUMMY40:%.*]] = mul i32 [[DUMMY39]], [[DUMMY39]]
+; CHECK-NEXT:    [[DUMMY15:%.*]] = mul i32 [[DUMMY40]], [[DUMMY40]]
 ; CHECK-NEXT:    [[DUMMY16:%.*]] = mul i32 [[DUMMY15]], [[DUMMY15]]
 ; CHECK-NEXT:    [[DUMMY17:%.*]] = mul i32 [[DUMMY16]], [[DUMMY16]]
 ; CHECK-NEXT:    [[DUMMY18:%.*]] = mul i32 [[DUMMY17]], [[DUMMY17]]
 ; CHECK-NEXT:    [[DUMMY19:%.*]] = mul i32 [[DUMMY18]], [[DUMMY18]]
 ; CHECK-NEXT:    [[DUMMY20:%.*]] = mul i32 [[DUMMY19]], [[DUMMY19]]
-; CHECK-NEXT:    [[DUMMY21:%.*]] = mul i32 [[DUMMY20]], [[DUMMY20]]
-; CHECK-NEXT:    [[DUMMY22:%.*]] = mul i32 [[DUMMY21]], [[DUMMY21]]
+; CHECK-NEXT:    [[VAL:%.*]] = mul i32 [[DUMMY20]], [[DUMMY20]]
+; CHECK-NEXT:    [[DUMMY1:%.*]] = mul i32 [[VAL]], [[VAL]]
+; CHECK-NEXT:    [[DUMMY2:%.*]] = mul i32 [[DUMMY1]], [[DUMMY1]]
+; CHECK-NEXT:    [[DUMMY3:%.*]] = mul i32 [[DUMMY2]], [[DUMMY2]]
+; CHECK-NEXT:    [[DUMMY41:%.*]] = mul i32 [[DUMMY3]], [[DUMMY3]]
+; CHECK-NEXT:    [[DUMMY26:%.*]] = mul i32 [[DUMMY41]], [[DUMMY41]]
+; CHECK-NEXT:    [[DUMMY11:%.*]] = mul i32 [[DUMMY26]], [[DUMMY26]]
+; CHECK-NEXT:    [[DUMMY12:%.*]] = mul i32 [[DUMMY11]], [[DUMMY11]]
+; CHECK-NEXT:    [[DUMMY13:%.*]] = mul i32 [[DUMMY12]], [[DUMMY12]]
+; CHECK-NEXT:    [[DUMMY14:%.*]] = mul i32 [[DUMMY13]], [[DUMMY13]]
+; CHECK-NEXT:    [[DUMMY31:%.*]] = mul i32 [[DUMMY14]], [[DUMMY14]]
+; CHECK-NEXT:    [[DUMMY32:%.*]] = mul i32 [[DUMMY31]], [[DUMMY31]]
+; CHECK-NEXT:    [[DUMMY22:%.*]] = mul i32 [[DUMMY32]], [[DUMMY32]]
 ; CHECK-NEXT:    [[DUMMY23:%.*]] = mul i32 [[DUMMY22]], [[DUMMY22]]
 ; CHECK-NEXT:    [[DUMMY24:%.*]] = mul i32 [[DUMMY23]], [[DUMMY23]]
 ; CHECK-NEXT:    [[DUMMY25:%.*]] = mul i32 [[DUMMY24]], [[DUMMY24]]
-; CHECK-NEXT:    [[DUMMY26:%.*]] = mul i32 [[DUMMY25]], [[DUMMY25]]
-; CHECK-NEXT:    [[DUMMY27:%.*]] = mul i32 [[DUMMY26]], [[DUMMY26]]
-; CHECK-NEXT:    [[DUMMY28:%.*]] = mul i32 [[DUMMY27]], [[DUMMY27]]
-; CHECK-NEXT:    [[DUMMY29:%.*]] = mul i32 [[DUMMY28]], [[DUMMY28]]
-; CHECK-NEXT:    [[DUMMY30:%.*]] = mul i32 [[DUMMY29]], [[DUMMY29]]
-; CHECK-NEXT:    [[DUMMY31:%.*]] = mul i32 [[DUMMY30]], [[DUMMY30]]
-; CHECK-NEXT:    [[DUMMY32:%.*]] = mul i32 [[DUMMY31]], [[DUMMY31]]
-; CHECK-NEXT:    [[DUMMY33:%.*]] = mul i32 [[DUMMY32]], [[DUMMY32]]
+; CHECK-NEXT:    [[DUMMY37:%.*]] = mul i32 [[DUMMY25]], [[DUMMY25]]
+; CHECK-NEXT:    [[DUMMY38:%.*]] = mul i32 [[DUMMY37]], [[DUMMY37]]
+; CHECK-NEXT:    [[DUMMY33:%.*]] = mul i32 [[DUMMY38]], [[DUMMY38]]
 ; CHECK-NEXT:    [[DUMMY34:%.*]] = mul i32 [[DUMMY33]], [[DUMMY33]]
 ; CHECK-NEXT:    [[DUMMY35:%.*]] = mul i32 [[DUMMY34]], [[DUMMY34]]
 ; CHECK-NEXT:    [[DUMMY36:%.*]] = mul i32 [[DUMMY35]], [[DUMMY35]]
-; CHECK-NEXT:    [[DUMMY37:%.*]] = mul i32 [[DUMMY36]], [[DUMMY36]]
-; CHECK-NEXT:    [[DUMMY38:%.*]] = mul i32 [[DUMMY37]], [[DUMMY37]]
-; CHECK-NEXT:    [[DUMMY39:%.*]] = mul i32 [[DUMMY38]], [[DUMMY38]]
-; CHECK-NEXT:    [[DUMMY40:%.*]] = mul i32 [[DUMMY39]], [[DUMMY39]]
-; CHECK-NEXT:    [[DUMMY41:%.*]] = mul i32 [[DUMMY40]], [[DUMMY40]]
-; CHECK-NEXT:    [[DUMMY42:%.*]] = mul i32 [[DUMMY41]], [[DUMMY41]]
-; CHECK-NEXT:    [[DUMMY43:%.*]] = mul i32 [[DUMMY42]], [[DUMMY42]]
+; CHECK-NEXT:    [[DUMMY43:%.*]] = mul i32 [[DUMMY36]], [[DUMMY36]]
 ; CHECK-NEXT:    [[DUMMY44:%.*]] = mul i32 [[DUMMY43]], [[DUMMY43]]
 ; CHECK-NEXT:    [[DUMMY45:%.*]] = mul i32 [[DUMMY44]], [[DUMMY44]]
 ; CHECK-NEXT:    [[DUMMY46:%.*]] = mul i32 [[DUMMY45]], [[DUMMY45]]
 ; CHECK-NEXT:    [[DUMMY47:%.*]] = mul i32 [[DUMMY46]], [[DUMMY46]]
 ; CHECK-NEXT:    [[DUMMY48:%.*]] = mul i32 [[DUMMY47]], [[DUMMY47]]
 ; CHECK-NEXT:    [[DUMMY49:%.*]] = mul i32 [[DUMMY48]], [[DUMMY48]]
-; CHECK-NEXT:    [[DUMMY50:%.*]] = mul i32 [[DUMMY49]], [[DUMMY49]]
-; CHECK-NEXT:    [[SUM_NEXT:%.*]] = add nsw i32 [[DUMMY50]], [[SUM]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    [[VAL_1:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4
-; CHECK-NEXT:    [[DUMMY1_1:%.*]] = mul i32 [[VAL_1]], [[VAL_1]]
-; CHECK-NEXT:    [[DUMMY2_1:%.*]] = mul i32 [[DUMMY1_1]], [[DUMMY1_1]]
-; CHECK-NEXT:    [[DUMMY3_1:%.*]] = mul i32 [[DUMMY2_1]], [[DUMMY2_1]]
-; CHECK-NEXT:    [[DUMMY4_1:%.*]] = mul i32 [[DUMMY3_1]], [[DUMMY3_1]]
-; CHECK-NEXT:    [[DUMMY5_1:%.*]] = mul i32 [[DUMMY4_1]], [[DUMMY4_1]]
-; CHECK-NEXT:    [[DUMMY6_1:%.*]] = mul i32 [[DUMMY5_1]], [[DUMMY5_1]]
-; CHECK-NEXT:    [[DUMMY7_1:%.*]] = mul i32 [[DUMMY6_1]], [[DUMMY6_1]]
-; CHECK-NEXT:    [[DUMMY8_1:%.*]] = mul i32 [[DUMMY7_1]], [[DUMMY7_1]]
-; CHECK-NEXT:    [[DUMMY9_1:%.*]] = mul i32 [[DUMMY8_1]], [[DUMMY8_1]]
-; CHECK-NEXT:    [[DUMMY10_1:%.*]] = mul i32 [[DUMMY9_1]], [[DUMMY9_1]]
-; CHECK-NEXT:    [[DUMMY11_1:%.*]] = mul i32 [[DUMMY10_1]], [[DUMMY10_1]]
-; CHECK-NEXT:    [[DUMMY12_1:%.*]] = mul i32 [[DUMMY11_1]], [[DUMMY11_1]]
-; CHECK-NEXT:    [[DUMMY13_1:%.*]] = mul i32 [[DUMMY12_1]], [[DUMMY12_1]]
-; CHECK-NEXT:    [[DUMMY14_1:%.*]] = mul i32 [[DUMMY13_1]], [[DUMMY13_1]]
-; CHECK-NEXT:    [[DUMMY15_1:%.*]] = mul i32 [[DUMMY14_1]], [[DUMMY14_1]]
-; CHECK-NEXT:    [[DUMMY16_1:%.*]] = mul i32 [[DUMMY15_1]], [[DUMMY15_1]]
-; CHECK-NEXT:    [[DUMMY17_1:%.*]] = mul i32 [[DUMMY16_1]], [[DUMMY16_1]]
-; CHECK-NEXT:    [[DUMMY18_1:%.*]] = mul i32 [[DUMMY17_1]], [[DUMMY17_1]]
-; CHECK-NEXT:    [[DUMMY19_1:%.*]] = mul i32 [[DUMMY18_1]], [[DUMMY18_1]]
-; CHECK-NEXT:    [[DUMMY20_1:%.*]] = mul i32 [[DUMMY19_1]], [[DUMMY19_1]]
-; CHECK-NEXT:    [[DUMMY21_1:%.*]] = mul i32 [[DUMMY20_1]], [[DUMMY20_1]]
-; CHECK-NEXT:    [[DUMMY22_1:%.*]] = mul i32 [[DUMMY21_1]], [[DUMMY21_1]]
-; CHECK-NEXT:    [[DUMMY23_1:%.*]] = mul i32 [[DUMMY22_1]], [[DUMMY22_1]]
-; CHECK-NEXT:    [[DUMMY24_1:%.*]] = mul i32 [[DUMMY23_1]], [[DUMMY23_1]]
-; CHECK-NEXT:    [[DUMMY25_1:%.*]] = mul i32 [[DUMMY24_1]], [[DUMMY24_1]]
-; CHECK-NEXT:    [[DUMMY26_1:%.*]] = mul i32 [[DUMMY25_1]], [[DUMMY25_1]]
-; CHECK-NEXT:    [[DUMMY27_1:%.*]] = mul i32 [[DUMMY26_1]], [[DUMMY26_1]]
-; CHECK-NEXT:    [[DUMMY28_1:%.*]] = mul i32 [[DUMMY27_1]], [[DUMMY27_1]]
-; CHECK-NEXT:    [[DUMMY29_1:%.*]] = mul i32 [[DUMMY28_1]], [[DUMMY28_1]]
-; CHECK-NEXT:    [[DUMMY30_1:%.*]] = mul i32 [[DUMMY29_1]], [[DUMMY29_1]]
-; CHECK-NEXT:    [[DUMMY31_1:%.*]] = mul i32 [[DUMMY30_1]], [[DUMMY30_1]]
-; CHECK-NEXT:    [[DUMMY32_1:%.*]] = mul i32 [[DUMMY31_1]], [[DUMMY31_1]]
-; CHECK-NEXT:    [[DUMMY33_1:%.*]] = mul i32 [[DUMMY32_1]], [[DUMMY32_1]]
-; CHECK-NEXT:    [[DUMMY34_1:%.*]] = mul i32 [[DUMMY33_1]], [[DUMMY33_1]]
-; CHECK-NEXT:    [[DUMMY35_1:%.*]] = mul i32 [[DUMMY34_1]], [[DUMMY34_1]]
-; CHECK-NEXT:    [[DUMMY36_1:%.*]] = mul i32 [[DUMMY35_1]], [[DUMMY35_1]]
-; CHECK-NEXT:    [[DUMMY37_1:%.*]] = mul i32 [[DUMMY36_1]], [[DUMMY36_1]]
-; CHECK-NEXT:    [[DUMMY38_1:%.*]] = mul i32 [[DUMMY37_1]], [[DUMMY37_1]]
-; CHECK-NEXT:    [[DUMMY39_1:%.*]] = mul i32 [[DUMMY38_1]], [[DUMMY38_1]]
-; CHECK-NEXT:    [[DUMMY40_1:%.*]] = mul i32 [[DUMMY39_1]], [[DUMMY39_1]]
-; CHECK-NEXT:    [[DUMMY41_1:%.*]] = mul i32 [[DUMMY40_1]], [[DUMMY40_1]]
-; CHECK-NEXT:    [[DUMMY42_1:%.*]] = mul i32 [[DUMMY41_1]], [[DUMMY41_1]]
-; CHECK-NEXT:    [[DUMMY43_1:%.*]] = mul i32 [[DUMMY42_1]], [[DUMMY42_1]]
-; CHECK-NEXT:    [[DUMMY44_1:%.*]] = mul i32 [[DUMMY43_1]], [[DUMMY43_1]]
-; CHECK-NEXT:    [[DUMMY45_1:%.*]] = mul i32 [[DUMMY44_1]], [[DUMMY44_1]]
-; CHECK-NEXT:    [[DUMMY46_1:%.*]] = mul i32 [[DUMMY45_1]], [[DUMMY45_1]]
-; CHECK-NEXT:    [[DUMMY47_1:%.*]] = mul i32 [[DUMMY46_1]], [[DUMMY46_1]]
-; CHECK-NEXT:    [[DUMMY48_1:%.*]] = mul i32 [[DUMMY47_1]], [[DUMMY47_1]]
-; CHECK-NEXT:    [[DUMMY49_1:%.*]] = mul i32 [[DUMMY48_1]], [[DUMMY48_1]]
-; CHECK-NEXT:    [[DUMMY50_1:%.*]] = mul i32 [[DUMMY49_1]], [[DUMMY49_1]]
-; CHECK-NEXT:    [[SUM_NEXT_1]] = add nsw i32 [[DUMMY50_1]], [[SUM_NEXT]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
-; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
-; CHECK-NEXT:    br i1 [[NITER_NCMP_1]], label [[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT:%.*]], label [[FOR_BODY]]
-; CHECK:       for.cond.cleanup.unr-lcssa.loopexit:
-; CHECK-NEXT:    [[SUM_NEXT_LCSSA_PH_PH:%.*]] = phi i32 [ [[SUM_NEXT_1]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_UNR_PH:%.*]] = phi i32 [ [[SUM_NEXT_1]], [[FOR_BODY]] ]
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_UNR_LCSSA]]
-; CHECK:       for.cond.cleanup.unr-lcssa:
-; CHECK-NEXT:    [[SUM_NEXT_LCSSA_PH:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[SUM_NEXT_LCSSA_PH_PH]], [[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]] ]
-; CHECK-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_UNR_PH]], [[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]] ]
-; CHECK-NEXT:    [[SUM_UNR:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_UNR_PH]], [[FOR_COND_CLEANUP_UNR_LCSSA_LOOPEXIT]] ]
-; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
-; CHECK-NEXT:    br i1 [[LCMP_MOD]], label [[FOR_BODY_EPIL_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.epil.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY_EPIL:%.*]]
-; CHECK:       for.body.epil:
-; CHECK-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i32, ptr [[ARY]], i64 [[INDVARS_IV_UNR]]
-; CHECK-NEXT:    [[VAL_EPIL:%.*]] = load i32, ptr [[ARRAYIDX_EPIL]], align 4
-; CHECK-NEXT:    [[DUMMY1_EPIL:%.*]] = mul i32 [[VAL_EPIL]], [[VAL_EPIL]]
-; CHECK-NEXT:    [[DUMMY2_EPIL:%.*]] = mul i32 [[DUMMY1_EPIL]], [[DUMMY1_EPIL]]
-; CHECK-NEXT:    [[DUMMY3_EPIL:%.*]] = mul i32 [[DUMMY2_EPIL]], [[DUMMY2_EPIL]]
-; CHECK-NEXT:    [[DUMMY4_EPIL:%.*]] = mul i32 [[DUMMY3_EPIL]], [[DUMMY3_EPIL]]
-; CHECK-NEXT:    [[DUMMY5_EPIL:%.*]] = mul i32 [[DUMMY4_EPIL]], [[DUMMY4_EPIL]]
-; CHECK-NEXT:    [[DUMMY6_EPIL:%.*]] = mul i32 [[DUMMY5_EPIL]], [[DUMMY5_EPIL]]
-; CHECK-NEXT:    [[DUMMY7_EPIL:%.*]] = mul i32 [[DUMMY6_EPIL]], [[DUMMY6_EPIL]]
-; CHECK-NEXT:    [[DUMMY8_EPIL:%.*]] = mul i32 [[DUMMY7_EPIL]], [[DUMMY7_EPIL]]
-; CHECK-NEXT:    [[DUMMY9_EPIL:%.*]] = mul i32 [[DUMMY8_EPIL]], [[DUMMY8_EPIL]]
-; CHECK-NEXT:    [[DUMMY10_EPIL:%.*]] = mul i32 [[DUMMY9_EPIL]], [[DUMMY9_EPIL]]
-; CHECK-NEXT:    [[DUMMY11_EPIL:%.*]] = mul i32 [[DUMMY10_EPIL]], [[DUMMY10_EPIL]]
-; CHECK-NEXT:    [[DUMMY12_EPIL:%.*]] = mul i32 [[DUMMY11_EPIL]], [[DUMMY11_EPIL]]
-; CHECK-NEXT:    [[DUMMY13_EPIL:%.*]] = mul i32 [[DUMMY12_EPIL]], [[DUMMY12_EPIL]]
-; CHECK-NEXT:    [[DUMMY14_EPIL:%.*]] = mul i32 [[DUMMY13_EPIL]], [[DUMMY13_EPIL]]
-; CHECK-NEXT:    [[DUMMY15_EPIL:%.*]] = mul i32 [[DUMMY14_EPIL]], [[DUMMY14_EPIL]]
-; CHECK-NEXT:    [[DUMMY16_EPIL:%.*]] = mul i32 [[DUMMY15_EPIL]], [[DUMMY15_EPIL]]
-; CHECK-NEXT:    [[DUMMY17_EPIL:%.*]] = mul i32 [[DUMMY16_EPIL]], [[DUMMY16_EPIL]]
-; CHECK-NEXT:    [[DUMMY18_EPIL:%.*]] = mul i32 [[DUMMY17_EPIL]], [[DUMMY17_EPIL]]
-; CHECK-NEXT:    [[DUMMY19_EPIL:%.*]] = mul i32 [[DUMMY18_EPIL]], [[DUMMY18_EPIL]]
-; CHECK-NEXT:    [[DUMMY20_EPIL:%.*]] = mul i32 [[DUMMY19_EPIL]], [[DUMMY19_EPIL]]
-; CHECK-NEXT:    [[DUMMY21_EPIL:%.*]] = mul i32 [[DUMMY20_EPIL]], [[DUMMY20_EPIL]]
-; CHECK-NEXT:    [[DUMMY22_EPIL:%.*]] = mul i32 [[DUMMY21_EPIL]], [[DUMMY21_EPIL]]
-; CHECK-NEXT:    [[DUMMY23_EPIL:%.*]] = mul i32 [[DUMMY22_EPIL]], [[DUMMY22_EPIL]]
-; CHECK-NEXT:    [[DUMMY24_EPIL:%.*]] = mul i32 [[DUMMY23_EPIL]], [[DUMMY23_EPIL]]
-; CHECK-NEXT:    [[DUMMY25_EPIL:%.*]] = mul i32 [[DUMMY24_EPIL]], [[DUMMY24_EPIL]]
-; CHECK-NEXT:    [[DUMMY26_EPIL:%.*]] = mul i32 [[DUMMY25_EPIL]], [[DUMMY25_EPIL]]
-; CHECK-NEXT:    [[DUMMY27_EPIL:%.*]] = mul i32 [[DUMMY26_EPIL]], [[DUMMY26_EPIL]]
-; CHECK-NEXT:    [[DUMMY28_EPIL:%.*]] = mul i32 [[DUMMY27_EPIL]], [[DUMMY27_EPIL]]
-; CHECK-NEXT:    [[DUMMY29_EPIL:%.*]] = mul i32 [[DUMMY28_EPIL]], [[DUMMY28_EPIL]]
-; CHECK-NEXT:    [[DUMMY30_EPIL:%.*]] = mul i32 [[DUMMY29_EPIL]], [[DUMMY29_EPIL]]
-; CHECK-NEXT:    [[DUMMY31_EPIL:%.*]] = mul i32 [[DUMMY30_EPIL]], [[DUMMY30_EPIL]]
-; CHECK-NEXT:    [[DUMMY32_EPIL:%.*]] = mul i32 [[DUMMY31_EPIL]], [[DUMMY31_EPIL]]
-; CHECK-NEXT:    [[DUMMY33_EPIL:%.*]] = mul i32 [[DUMMY32_EPIL]], [[DUMMY32_EPIL]]
-; CHECK-NEXT:    [[DUMMY34_EPIL:%.*]] = mul i32 [[DUMMY33_EPIL]], [[DUMMY33_EPIL]]
-; CHECK-NEXT:    [[DUMMY35_EPIL:%.*]] = mul i32 [[DUMMY34_EPIL]], [[DUMMY34_EPIL]]
-; CHECK-NEXT:    [[DUMMY36_EPIL:%.*]] = mul i32 [[DUMMY35_EPIL]], [[DUMMY35_EPIL]]
-; CHECK-NEXT:    [[DUMMY37_EPIL:%.*]] = mul i32 [[DUMMY36_EPIL]], [[DUMMY36_EPIL]]
-; CHECK-NEXT:    [[DUMMY38_EPIL:%.*]] = mul i32 [[DUMMY37_EPIL]], [[DUMMY37_EPIL]]
-; CHECK-NEXT:    [[DUMMY39_EPIL:%.*]] = mul i32 [[DUMMY38_EPIL]], [[DUMMY38_EPIL]]
-; CHECK-NEXT:    [[DUMMY40_EPIL:%.*]] = mul i32 [[DUMMY39_EPIL]], [[DUMMY39_EPIL]]
-; CHECK-NEXT:    [[DUMMY41_EPIL:%.*]] = mul i32 [[DUMMY40_EPIL]], [[DUMMY40_EPIL]]
-; CHECK-NEXT:    [[DUMMY42_EPIL:%.*]] = mul i32 [[DUMMY41_EPIL]], [[DUMMY41_EPIL]]
-; CHECK-NEXT:    [[DUMMY43_EPIL:%.*]] = mul i32 [[DUMMY42_EPIL]], [[DUMMY42_EPIL]]
-; CHECK-NEXT:    [[DUMMY44_EPIL:%.*]] = mul i32 [[DUMMY43_EPIL]], [[DUMMY43_EPIL]]
-; CHECK-NEXT:    [[DUMMY45_EPIL:%.*]] = mul i32 [[DUMMY44_EPIL]], [[DUMMY44_EPIL]]
-; CHECK-NEXT:    [[DUMMY46_EPIL:%.*]] = mul i32 [[DUMMY45_EPIL]], [[DUMMY45_EPIL]]
-; CHECK-NEXT:    [[DUMMY47_EPIL:%.*]] = mul i32 [[DUMMY46_EPIL]], [[DUMMY46_EPIL]]
-; CHECK-NEXT:    [[DUMMY48_EPIL:%.*]] = mul i32 [[DUMMY47_EPIL]], [[DUMMY47_EPIL]]
-; CHECK-NEXT:    [[DUMMY49_EPIL:%.*]] = mul i32 [[DUMMY48_EPIL]], [[DUMMY48_EPIL]]
-; CHECK-NEXT:    [[DUMMY50_EPIL:%.*]] = mul i32 [[DUMMY49_EPIL]], [[DUMMY49_EPIL]]
-; CHECK-NEXT:    [[SUM_NEXT_EPIL:%.*]] = add nsw i32 [[DUMMY50_EPIL]], [[SUM_UNR]]
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK-NEXT:    [[DUMMY50_EPIL:%.*]] = mul i32 [[DUMMY49]], [[DUMMY49]]
+; CHECK-NEXT:    [[SUM_NEXT_EPIL]] = add nsw i32 [[DUMMY50_EPIL]], [[SUM_EPIL]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_EPIL]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_EPIL]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT_LCSSA_PH]], [[FOR_COND_CLEANUP_UNR_LCSSA]] ], [ [[SUM_NEXT_EPIL]], [[FOR_BODY_EPIL]] ]
+; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT_EPIL]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_NEXT_LCSSA]]
 ;
 entry:



More information about the llvm-commits mailing list