[llvm] 44c1a56 - [ARM] Add extra MVE tests for various patches. NFC
David Green via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 1 08:24:37 PST 2020
Author: David Green
Date: 2020-11-01T16:24:23Z
New Revision: 44c1a568695a40da8de41ed23699a367d8baecc4
URL: https://github.com/llvm/llvm-project/commit/44c1a568695a40da8de41ed23699a367d8baecc4
DIFF: https://github.com/llvm/llvm-project/commit/44c1a568695a40da8de41ed23699a367d8baecc4.diff
LOG: [ARM] Add extra MVE tests for various patches. NFC
Added:
llvm/test/CodeGen/Thumb2/mve-nounrolledremainder.ll
llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
Modified:
llvm/test/Transforms/HardwareLoops/scalar-while.ll
llvm/test/Transforms/LoopUnroll/ARM/mve-nounroll.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/Thumb2/mve-nounrolledremainder.ll b/llvm/test/CodeGen/Thumb2/mve-nounrolledremainder.ll
new file mode 100644
index 000000000000..ac144dd41484
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-nounrolledremainder.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -o - %s | FileCheck --check-prefix=CHECK %s
+
+define void @tailpred(half* nocapture readonly %pSrcA, half* nocapture readonly %pSrcB, half* nocapture %pDst, i32 %blockSize) {
+; CHECK-LABEL: tailpred:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: cmp r3, #0
+; CHECK-NEXT: beq .LBB0_6
+; CHECK-NEXT: @ %bb.1: @ %vector.memcheck
+; CHECK-NEXT: add.w r4, r2, r3, lsl #1
+; CHECK-NEXT: add.w r5, r1, r3, lsl #1
+; CHECK-NEXT: cmp r4, r1
+; CHECK-NEXT: cset r12, hi
+; CHECK-NEXT: cmp r5, r2
+; CHECK-NEXT: cset lr, hi
+; CHECK-NEXT: cmp r4, r0
+; CHECK-NEXT: add.w r5, r0, r3, lsl #1
+; CHECK-NEXT: cset r4, hi
+; CHECK-NEXT: cmp r5, r2
+; CHECK-NEXT: cset r5, hi
+; CHECK-NEXT: ands r4, r5
+; CHECK-NEXT: lsls r4, r4, #31
+; CHECK-NEXT: mov r4, r3
+; CHECK-NEXT: itt eq
+; CHECK-NEXT: andeq.w r3, lr, r12
+; CHECK-NEXT: lslseq.w r3, r3, #31
+; CHECK-NEXT: beq .LBB0_4
+; CHECK-NEXT: @ %bb.2: @ %while.body.preheader
+; CHECK-NEXT: dls lr, r4
+; CHECK-NEXT: .LBB0_3: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldr.16 s0, [r0]
+; CHECK-NEXT: vldr.16 s2, [r1]
+; CHECK-NEXT: adds r1, #2
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vadd.f16 s0, s2, s0
+; CHECK-NEXT: vstr.16 s0, [r2]
+; CHECK-NEXT: adds r2, #2
+; CHECK-NEXT: le lr, .LBB0_3
+; CHECK-NEXT: b .LBB0_6
+; CHECK-NEXT: .LBB0_4: @ %vector.ph
+; CHECK-NEXT: adds r3, r4, #7
+; CHECK-NEXT: movs r5, #1
+; CHECK-NEXT: bic r3, r3, #7
+; CHECK-NEXT: subs r3, #8
+; CHECK-NEXT: add.w r5, r5, r3, lsr #3
+; CHECK-NEXT: mov r3, r5
+; CHECK-NEXT: mov r5, r4
+; CHECK-NEXT: .LBB0_5: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: mov lr, r3
+; CHECK-NEXT: vctp.16 r5
+; CHECK-NEXT: sub.w lr, lr, #1
+; CHECK-NEXT: subs r5, #8
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrht.u16 q0, [r0], #16
+; CHECK-NEXT: vldrht.u16 q1, [r1], #16
+; CHECK-NEXT: mov r3, lr
+; CHECK-NEXT: vadd.f16 q0, q1, q0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q0, [r2], #16
+; CHECK-NEXT: cmp.w lr, #0
+; CHECK-NEXT: bne .LBB0_5
+; CHECK-NEXT: b .LBB0_6
+; CHECK-NEXT: .LBB0_6: @ %while.end
+; CHECK-NEXT: pop {r4, r5, r7, pc}
+entry: ; pDst[i] = pSrcB[i] + pSrcA[i] for %blockSize half elements
+ %cmp.not6 = icmp eq i32 %blockSize, 0
+ br i1 %cmp.not6, label %while.end, label %vector.memcheck ; zero elements: branch straight to the return
+
+vector.memcheck: ; preds = %entry
+ %scevgep = getelementptr half, half* %pDst, i32 %blockSize ; pDst + %blockSize, one past the last destination element
+ %scevgep14 = getelementptr half, half* %pSrcA, i32 %blockSize
+ %scevgep17 = getelementptr half, half* %pSrcB, i32 %blockSize
+ %bound0 = icmp ugt half* %scevgep14, %pDst
+ %bound1 = icmp ugt half* %scevgep, %pSrcA
+ %found.conflict = and i1 %bound0, %bound1
+ %bound019 = icmp ugt half* %scevgep17, %pDst
+ %bound120 = icmp ugt half* %scevgep, %pSrcB
+ %found.conflict21 = and i1 %bound019, %bound120
+ %conflict.rdx = or i1 %found.conflict, %found.conflict21 ; set when the pDst range overlaps the pSrcA or pSrcB range
+ br i1 %conflict.rdx, label %while.body, label %vector.ph ; overlap -> scalar loop, no overlap -> vector loop
+
+vector.ph: ; preds = %vector.memcheck
+ %n.rnd.up = add i32 %blockSize, 7
+ %n.vec = and i32 %n.rnd.up, -8 ; %blockSize + 7 with the low three bits cleared (a multiple of 8)
+ br label %vector.body ; NOTE(review): the expected assembly above lowers this loop with vctp.16/vpst and cmp/bne, not dlstp/letp
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %next.gep = getelementptr half, half* %pSrcA, i32 %index
+ %next.gep28 = getelementptr half, half* %pDst, i32 %index
+ %next.gep29 = getelementptr half, half* %pSrcB, i32 %index
+ %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %blockSize) ; mask built from %index and %blockSize; it predicates both loads and the store
+ %0 = bitcast half* %next.gep to <8 x half>*
+ %wide.masked.load = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %active.lane.mask, <8 x half> undef)
+ %1 = bitcast half* %next.gep29 to <8 x half>*
+ %wide.masked.load32 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x half> undef)
+ %2 = fadd fast <8 x half> %wide.masked.load32, %wide.masked.load
+ %3 = bitcast half* %next.gep28 to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %2, <8 x half>* %3, i32 2, <8 x i1> %active.lane.mask)
+ %index.next = add i32 %index, 8 ; each vector iteration advances by 8 elements
+ %4 = icmp eq i32 %index.next, %n.vec
+ br i1 %4, label %while.end, label %vector.body
+
+while.body: ; preds = %vector.memcheck, %while.body
+ %blkCnt.010 = phi i32 [ %dec, %while.body ], [ %blockSize, %vector.memcheck ]
+ %pSrcA.addr.09 = phi half* [ %incdec.ptr, %while.body ], [ %pSrcA, %vector.memcheck ]
+ %pDst.addr.08 = phi half* [ %incdec.ptr3, %while.body ], [ %pDst, %vector.memcheck ]
+ %pSrcB.addr.07 = phi half* [ %incdec.ptr1, %while.body ], [ %pSrcB, %vector.memcheck ]
+ %incdec.ptr = getelementptr inbounds half, half* %pSrcA.addr.09, i32 1
+ %5 = load half, half* %pSrcA.addr.09, align 2
+ %incdec.ptr1 = getelementptr inbounds half, half* %pSrcB.addr.07, i32 1
+ %6 = load half, half* %pSrcB.addr.07, align 2
+ %7 = fadd fast half %6, %5
+ %incdec.ptr3 = getelementptr inbounds half, half* %pDst.addr.08, i32 1
+ store half %7, half* %pDst.addr.08, align 2
+ %dec = add i32 %blkCnt.010, -1 ; scalar loop counts down from %blockSize and stops at 0
+ %cmp.not = icmp eq i32 %dec, 0
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.end: ; preds = %vector.body, %while.body, %entry
+ ret void
+}
+
+define void @notailpred(half* nocapture readonly %pSrcA, half* nocapture readonly %pSrcB, half* nocapture %pDst, i32 %blockSize) {
+; CHECK-LABEL: notailpred:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, r8, r9, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r8, r9, lr}
+; CHECK-NEXT: cbz r3, .LBB1_6
+; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
+; CHECK-NEXT: cmp r3, #8
+; CHECK-NEXT: blo .LBB1_3
+; CHECK-NEXT: @ %bb.2: @ %vector.memcheck
+; CHECK-NEXT: add.w r4, r2, r3, lsl #1
+; CHECK-NEXT: add.w r5, r1, r3, lsl #1
+; CHECK-NEXT: cmp r4, r1
+; CHECK-NEXT: add.w r6, r0, r3, lsl #1
+; CHECK-NEXT: cset r12, hi
+; CHECK-NEXT: cmp r5, r2
+; CHECK-NEXT: cset r5, hi
+; CHECK-NEXT: cmp r4, r0
+; CHECK-NEXT: cset r4, hi
+; CHECK-NEXT: cmp r6, r2
+; CHECK-NEXT: cset r6, hi
+; CHECK-NEXT: ands r6, r4
+; CHECK-NEXT: lsls r6, r6, #31
+; CHECK-NEXT: itt eq
+; CHECK-NEXT: andeq.w r6, r5, r12
+; CHECK-NEXT: lslseq.w r6, r6, #31
+; CHECK-NEXT: beq .LBB1_7
+; CHECK-NEXT: .LBB1_3:
+; CHECK-NEXT: mov lr, r3
+; CHECK-NEXT: mov r12, r0
+; CHECK-NEXT: mov r4, r2
+; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: .LBB1_4: @ %while.body.preheader31
+; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: .LBB1_5: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldr.16 s0, [r12]
+; CHECK-NEXT: vldr.16 s2, [r5]
+; CHECK-NEXT: adds r5, #2
+; CHECK-NEXT: add.w r12, r12, #2
+; CHECK-NEXT: vadd.f16 s0, s2, s0
+; CHECK-NEXT: vstr.16 s0, [r4]
+; CHECK-NEXT: adds r4, #2
+; CHECK-NEXT: le lr, .LBB1_5
+; CHECK-NEXT: .LBB1_6: @ %while.end
+; CHECK-NEXT: pop.w {r4, r5, r6, r8, r9, pc}
+; CHECK-NEXT: .LBB1_7: @ %vector.ph
+; CHECK-NEXT: bic r8, r3, #7
+; CHECK-NEXT: movs r4, #1
+; CHECK-NEXT: sub.w r5, r8, #8
+; CHECK-NEXT: and r9, r3, #7
+; CHECK-NEXT: add.w r12, r0, r8, lsl #1
+; CHECK-NEXT: add.w r5, r4, r5, lsr #3
+; CHECK-NEXT: add.w r4, r2, r8, lsl #1
+; CHECK-NEXT: mov r6, r5
+; CHECK-NEXT: add.w r5, r1, r8, lsl #1
+; CHECK-NEXT: .LBB1_8: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrh.u16 q0, [r0], #16
+; CHECK-NEXT: vldrh.u16 q1, [r1], #16
+; CHECK-NEXT: mov lr, r6
+; CHECK-NEXT: vadd.f16 q0, q1, q0
+; CHECK-NEXT: subs.w lr, lr, #1
+; CHECK-NEXT: vstrb.8 q0, [r2], #16
+; CHECK-NEXT: mov r6, lr
+; CHECK-NEXT: bne .LBB1_8
+; CHECK-NEXT: b .LBB1_9
+; CHECK-NEXT: .LBB1_9: @ %middle.block
+; CHECK-NEXT: cmp r8, r3
+; CHECK-NEXT: mov lr, r9
+; CHECK-NEXT: bne .LBB1_4
+; CHECK-NEXT: b .LBB1_6
+entry: ; pDst[i] = pSrcB[i] + pSrcA[i] for %blockSize half elements; the vector loop here is not predicated
+ %cmp.not6 = icmp eq i32 %blockSize, 0
+ br i1 %cmp.not6, label %while.end, label %while.body.preheader ; zero elements: branch straight to the return
+
+while.body.preheader: ; preds = %entry
+ %min.iters.check = icmp ult i32 %blockSize, 8 ; fewer than 8 elements: only the scalar loop runs
+ br i1 %min.iters.check, label %while.body.preheader31, label %vector.memcheck
+
+vector.memcheck: ; preds = %while.body.preheader
+ %scevgep = getelementptr half, half* %pDst, i32 %blockSize ; pDst + %blockSize, one past the last destination element
+ %scevgep14 = getelementptr half, half* %pSrcA, i32 %blockSize
+ %scevgep17 = getelementptr half, half* %pSrcB, i32 %blockSize
+ %bound0 = icmp ugt half* %scevgep14, %pDst
+ %bound1 = icmp ugt half* %scevgep, %pSrcA
+ %found.conflict = and i1 %bound0, %bound1
+ %bound019 = icmp ugt half* %scevgep17, %pDst
+ %bound120 = icmp ugt half* %scevgep, %pSrcB
+ %found.conflict21 = and i1 %bound019, %bound120
+ %conflict.rdx = or i1 %found.conflict, %found.conflict21 ; set when the pDst range overlaps the pSrcA or pSrcB range
+ br i1 %conflict.rdx, label %while.body.preheader31, label %vector.ph ; overlap -> scalar loop, no overlap -> vector loop
+
+vector.ph: ; preds = %vector.memcheck
+ %n.vec = and i32 %blockSize, -8 ; element count for the vector loop: %blockSize with the low three bits cleared
+ %ind.end = and i32 %blockSize, 7 ; low three bits of %blockSize, the count left over for the scalar loop
+ %ind.end23 = getelementptr half, half* %pSrcA, i32 %n.vec
+ %ind.end25 = getelementptr half, half* %pDst, i32 %n.vec
+ %ind.end27 = getelementptr half, half* %pSrcB, i32 %n.vec
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %next.gep = getelementptr half, half* %pSrcA, i32 %index
+ %next.gep28 = getelementptr half, half* %pDst, i32 %index
+ %next.gep29 = getelementptr half, half* %pSrcB, i32 %index
+ %0 = bitcast half* %next.gep to <8 x half>*
+ %wide.load = load <8 x half>, <8 x half>* %0, align 2
+ %1 = bitcast half* %next.gep29 to <8 x half>*
+ %wide.load30 = load <8 x half>, <8 x half>* %1, align 2
+ %2 = fadd fast <8 x half> %wide.load30, %wide.load
+ %3 = bitcast half* %next.gep28 to <8 x half>*
+ store <8 x half> %2, <8 x half>* %3, align 2 ; plain (unmasked) store of all 8 lanes
+ %index.next = add i32 %index, 8 ; each vector iteration advances by 8 elements
+ %4 = icmp eq i32 %index.next, %n.vec
+ br i1 %4, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %cmp.n = icmp eq i32 %n.vec, %blockSize ; true when the vector loop already covered every element
+ br i1 %cmp.n, label %while.end, label %while.body.preheader31
+
+while.body.preheader31: ; preds = %middle.block, %vector.memcheck, %while.body.preheader
+ %blkCnt.010.ph = phi i32 [ %blockSize, %vector.memcheck ], [ %blockSize, %while.body.preheader ], [ %ind.end, %middle.block ] ; element count the scalar loop starts from
+ %pSrcA.addr.09.ph = phi half* [ %pSrcA, %vector.memcheck ], [ %pSrcA, %while.body.preheader ], [ %ind.end23, %middle.block ]
+ %pDst.addr.08.ph = phi half* [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end25, %middle.block ]
+ %pSrcB.addr.07.ph = phi half* [ %pSrcB, %vector.memcheck ], [ %pSrcB, %while.body.preheader ], [ %ind.end27, %middle.block ]
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader31, %while.body
+ %blkCnt.010 = phi i32 [ %dec, %while.body ], [ %blkCnt.010.ph, %while.body.preheader31 ]
+ %pSrcA.addr.09 = phi half* [ %incdec.ptr, %while.body ], [ %pSrcA.addr.09.ph, %while.body.preheader31 ]
+ %pDst.addr.08 = phi half* [ %incdec.ptr3, %while.body ], [ %pDst.addr.08.ph, %while.body.preheader31 ]
+ %pSrcB.addr.07 = phi half* [ %incdec.ptr1, %while.body ], [ %pSrcB.addr.07.ph, %while.body.preheader31 ]
+ %incdec.ptr = getelementptr inbounds half, half* %pSrcA.addr.09, i32 1
+ %5 = load half, half* %pSrcA.addr.09, align 2
+ %incdec.ptr1 = getelementptr inbounds half, half* %pSrcB.addr.07, i32 1
+ %6 = load half, half* %pSrcB.addr.07, align 2
+ %7 = fadd fast half %6, %5
+ %incdec.ptr3 = getelementptr inbounds half, half* %pDst.addr.08, i32 1
+ store half %7, half* %pDst.addr.08, align 2
+ %dec = add i32 %blkCnt.010, -1 ; scalar loop counts down and stops at 0
+ %cmp.not = icmp eq i32 %dec, 0
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %middle.block, %entry
+ ret void
+}
+
+declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1 ; NOTE(review): attribute group #1 has no definition in this test file
+declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32 immarg, <8 x i1>, <8 x half>) #2 ; NOTE(review): attribute group #2 has no definition in this test file
+declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32 immarg, <8 x i1>) #3 ; NOTE(review): attribute group #3 has no definition in this test file
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
new file mode 100644
index 000000000000..d2819034c44b
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -0,0 +1,1674 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
+
+%struct.DCT_InstanceTypeDef = type { float*, i32, i32 } ; field 0 is loaded as %pDCTCoefs, field 1 as %NumFilters, field 2 as %NumInputs by the functions in this file
+
+define void @DCT_mve1(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
+; CHECK-LABEL: DCT_mve1:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT: ldr r3, [r0, #4]
+; CHECK-NEXT: sub.w r12, r3, #1
+; CHECK-NEXT: cmp.w r12, #2
+; CHECK-NEXT: blo .LBB0_5
+; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT: ldr.w r9, [r0, #8]
+; CHECK-NEXT: ldr r3, [r0]
+; CHECK-NEXT: add.w r3, r3, r9, lsl #2
+; CHECK-NEXT: movs r0, #1
+; CHECK-NEXT: lsl.w r8, r9, #2
+; CHECK-NEXT: .LBB0_2: @ %for.body
+; CHECK-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NEXT: @ Child Loop BB0_3 Depth 2
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: mov r6, r1
+; CHECK-NEXT: mov r7, r3
+; CHECK-NEXT: mov r5, r9
+; CHECK-NEXT: dlstp.32 lr, r5
+; CHECK-NEXT: .LBB0_3: @ %vector.body
+; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
+; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT: vldrw.u32 q1, [r6], #16
+; CHECK-NEXT: vldrw.u32 q2, [r7], #16
+; CHECK-NEXT: vfma.f32 q0, q2, q1
+; CHECK-NEXT: letp lr, .LBB0_3
+; CHECK-NEXT: @ %bb.4: @ %middle.block
+; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT: vadd.f32 s4, s2, s3
+; CHECK-NEXT: add.w r7, r2, r0, lsl #2
+; CHECK-NEXT: vadd.f32 s0, s0, s1
+; CHECK-NEXT: adds r0, #1
+; CHECK-NEXT: add r3, r8
+; CHECK-NEXT: cmp r0, r12
+; CHECK-NEXT: vadd.f32 s0, s0, s4
+; CHECK-NEXT: vstr s0, [r7]
+; CHECK-NEXT: bne .LBB0_2
+; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
+entry: ; one output per outer iteration: pOut[k] = sum of pIn[i] * pDCTCoefs[k * NumInputs + i], with k starting at 1
+ %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
+ %0 = load i32, i32* %NumInputs, align 4
+ %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
+ %1 = load i32, i32* %NumFilters, align 4
+ %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
+ %2 = load float*, float** %pDCTCoefs, align 4
+ %cmp = icmp ugt i32 %0, 1
+ tail call void @llvm.assume(i1 %cmp) ; the IR assumes NumInputs > 1
+ %sub = add i32 %1, -1 ; NumFilters - 1, the value the outer loop counter is compared against
+ %cmp350 = icmp ugt i32 %sub, 1
+ br i1 %cmp350, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %n.rnd.up = add i32 %0, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; NumInputs + 3 with the low two bits cleared (a multiple of 4)
+ br label %for.body
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %middle.block
+ %k2.051 = phi i32 [ %add16, %middle.block ], [ 1, %for.body.preheader ]
+ %mul4 = mul i32 %k2.051, %0 ; element offset of row %k2.051 inside the coefficient array
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %for.body
+ %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
+ %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %10, %vector.body ]
+ %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) ; mask built from %index and NumInputs; it predicates both loads and the accumulator update
+ %3 = getelementptr inbounds float, float* %pIn, i32 %index
+ %4 = bitcast float* %3 to <4 x float>*
+ %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %5 = add i32 %index, %mul4
+ %6 = getelementptr inbounds float, float* %2, i32 %5
+ %7 = bitcast float* %6 to <4 x float>*
+ %wide.masked.load53 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %8 = fmul fast <4 x float> %wide.masked.load53, %wide.masked.load
+ %9 = fadd fast <4 x float> %8, %vec.phi
+ %10 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi ; lanes outside the mask keep the previous accumulator value
+ %index.next = add i32 %index, 4
+ %11 = icmp eq i32 %index.next, %n.vec
+ br i1 %11, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %12 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %10) ; add the four accumulator lanes together
+ %arrayidx14 = getelementptr inbounds float, float* %pOut, i32 %k2.051
+ store float %12, float* %arrayidx14, align 4
+ %add16 = add nuw i32 %k2.051, 1
+ %exitcond52.not = icmp eq i32 %add16, %sub
+ br i1 %exitcond52.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
+; CHECK-LABEL: DCT_mve2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [r0, #4]
+; CHECK-NEXT: subs r1, #2
+; CHECK-NEXT: cmp r1, #2
+; CHECK-NEXT: blo .LBB1_5
+; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT: ldr.w r12, [r0, #8]
+; CHECK-NEXT: movs r4, #1
+; CHECK-NEXT: ldr r3, [r0]
+; CHECK-NEXT: add.w r5, r3, r12, lsl #2
+; CHECK-NEXT: add.w r7, r3, r12, lsl #3
+; CHECK-NEXT: lsl.w r8, r12, #3
+; CHECK-NEXT: .LBB1_2: @ %for.body
+; CHECK-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NEXT: @ Child Loop BB1_3 Depth 2
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r10, r4, #1
+; CHECK-NEXT: mov r3, r5
+; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: dlstp.32 lr, r12
+; CHECK-NEXT: mov r11, r12
+; CHECK-NEXT: .LBB1_3: @ %vector.body
+; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1
+; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT: vldrw.u32 q2, [r6], #16
+; CHECK-NEXT: vldrw.u32 q3, [r3], #16
+; CHECK-NEXT: vfma.f32 q1, q3, q2
+; CHECK-NEXT: vldrw.u32 q3, [r0], #16
+; CHECK-NEXT: vfma.f32 q0, q3, q2
+; CHECK-NEXT: letp lr, .LBB1_3
+; CHECK-NEXT: @ %bb.4: @ %middle.block
+; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: vadd.f32 s8, s2, s3
+; CHECK-NEXT: add.w r0, r2, r10, lsl #2
+; CHECK-NEXT: vadd.f32 s0, s0, s1
+; CHECK-NEXT: add r5, r8
+; CHECK-NEXT: vadd.f32 s2, s6, s7
+; CHECK-NEXT: add r7, r8
+; CHECK-NEXT: vadd.f32 s4, s4, s5
+; CHECK-NEXT: vadd.f32 s0, s0, s8
+; CHECK-NEXT: vadd.f32 s2, s4, s2
+; CHECK-NEXT: vstr s0, [r0]
+; CHECK-NEXT: add.w r0, r2, r4, lsl #2
+; CHECK-NEXT: adds r4, #2
+; CHECK-NEXT: cmp r4, r1
+; CHECK-NEXT: vstr s2, [r0]
+; CHECK-NEXT: blo .LBB1_2
+; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+entry: ; two outputs per outer iteration, pOut[k] and pOut[k + 1]; k starts at 1 and advances by 2
+ %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
+ %0 = load i32, i32* %NumInputs, align 4
+ %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
+ %1 = load i32, i32* %NumFilters, align 4
+ %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
+ %2 = load float*, float** %pDCTCoefs, align 4
+ %cmp = icmp ugt i32 %0, 1
+ tail call void @llvm.assume(i1 %cmp) ; the IR assumes NumInputs > 1
+ %sub = add i32 %1, -2 ; NumFilters - 2, the bound the outer loop counter is compared against
+ %cmp371 = icmp ugt i32 %sub, 1
+ br i1 %cmp371, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %n.rnd.up = add i32 %0, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; NumInputs + 3 with the low two bits cleared (a multiple of 4)
+ br label %for.body
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %middle.block
+ %k2.072 = phi i32 [ %add25, %middle.block ], [ 1, %for.body.preheader ]
+ %mul4 = mul i32 %k2.072, %0 ; element offset of row k inside the coefficient array
+ %add = add nuw i32 %k2.072, 1
+ %mul5 = mul i32 %add, %0 ; element offset of row k + 1
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %for.body
+ %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
+ %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %15, %vector.body ] ; accumulator for row k + 1
+ %vec.phi73 = phi <4 x float> [ zeroinitializer, %for.body ], [ %16, %vector.body ] ; accumulator for row k
+ %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) ; mask built from %index and NumInputs; shared by every load and select below
+ %3 = getelementptr inbounds float, float* %pIn, i32 %index
+ %4 = bitcast float* %3 to <4 x float>*
+ %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %5 = add i32 %index, %mul4
+ %6 = getelementptr inbounds float, float* %2, i32 %5
+ %7 = bitcast float* %6 to <4 x float>*
+ %wide.masked.load74 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %8 = fmul fast <4 x float> %wide.masked.load74, %wide.masked.load
+ %9 = fadd fast <4 x float> %8, %vec.phi73
+ %10 = add i32 %index, %mul5
+ %11 = getelementptr inbounds float, float* %2, i32 %10
+ %12 = bitcast float* %11 to <4 x float>*
+ %wide.masked.load75 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %13 = fmul fast <4 x float> %wide.masked.load75, %wide.masked.load
+ %14 = fadd fast <4 x float> %13, %vec.phi
+ %15 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi ; lanes outside the mask keep the previous accumulator value
+ %16 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi73
+ %index.next = add i32 %index, 4
+ %17 = icmp eq i32 %index.next, %n.vec
+ br i1 %17, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %16) ; lane sum for row k
+ %19 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %15) ; lane sum for row k + 1
+ %arrayidx21 = getelementptr inbounds float, float* %pOut, i32 %k2.072
+ store float %18, float* %arrayidx21, align 4
+ %arrayidx23 = getelementptr inbounds float, float* %pOut, i32 %add
+ store float %19, float* %arrayidx23, align 4
+ %add25 = add i32 %k2.072, 2
+ %cmp3 = icmp ult i32 %add25, %sub
+ br i1 %cmp3, label %for.body, label %for.cond.cleanup
+}
+
+define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
+; CHECK-LABEL: DCT_mve3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [r0, #4]
+; CHECK-NEXT: subs r1, #3
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: cmp r1, #2
+; CHECK-NEXT: blo .LBB2_5
+; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: movs r5, #1
+; CHECK-NEXT: ldr r1, [r0]
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r3, r3, lsl #1
+; CHECK-NEXT: add.w r7, r1, r3, lsl #2
+; CHECK-NEXT: add.w r12, r1, r3, lsl #3
+; CHECK-NEXT: adds r3, #3
+; CHECK-NEXT: bic r3, r3, #3
+; CHECK-NEXT: add.w r1, r1, r0, lsl #2
+; CHECK-NEXT: subs r3, #4
+; CHECK-NEXT: lsl.w r11, r0, #2
+; CHECK-NEXT: add.w r3, r5, r3, lsr #2
+; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
+; CHECK-NEXT: .LBB2_2: @ %for.body
+; CHECK-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NEXT: @ Child Loop BB2_3 Depth 2
+; CHECK-NEXT: ldr.w lr, [sp] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: add.w r9, r5, #2
+; CHECK-NEXT: add.w r10, r5, #1
+; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: mov r3, r7
+; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: mov r0, r12
+; CHECK-NEXT: ldr.w r8, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: mov r4, r1
+; CHECK-NEXT: vmov q2, q0
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: .LBB2_3: @ %vector.body
+; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1
+; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT: vctp.32 r8
+; CHECK-NEXT: sub.w r8, r8, #4
+; CHECK-NEXT: vpstttt
+; CHECK-NEXT: vldrwt.u32 q3, [r6], #16
+; CHECK-NEXT: vldrwt.u32 q4, [r3], #16
+; CHECK-NEXT: vfmat.f32 q1, q4, q3
+; CHECK-NEXT: vldrwt.u32 q4, [r0], #16
+; CHECK-NEXT: vpsttt
+; CHECK-NEXT: vfmat.f32 q2, q4, q3
+; CHECK-NEXT: vldrwt.u32 q4, [r4], #16
+; CHECK-NEXT: vfmat.f32 q0, q4, q3
+; CHECK-NEXT: le lr, .LBB2_3
+; CHECK-NEXT: @ %bb.4: @ %middle.block
+; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT: vadd.f32 s12, s10, s11
+; CHECK-NEXT: add.w r0, r2, r10, lsl #2
+; CHECK-NEXT: vadd.f32 s8, s8, s9
+; CHECK-NEXT: add r7, r11
+; CHECK-NEXT: vadd.f32 s10, s6, s7
+; CHECK-NEXT: add r12, r11
+; CHECK-NEXT: vadd.f32 s4, s4, s5
+; CHECK-NEXT: add r1, r11
+; CHECK-NEXT: vadd.f32 s6, s2, s3
+; CHECK-NEXT: vadd.f32 s0, s0, s1
+; CHECK-NEXT: vadd.f32 s2, s8, s12
+; CHECK-NEXT: vadd.f32 s4, s4, s10
+; CHECK-NEXT: vadd.f32 s0, s0, s6
+; CHECK-NEXT: vstr s2, [r0]
+; CHECK-NEXT: add.w r0, r2, r5, lsl #2
+; CHECK-NEXT: adds r5, #3
+; CHECK-NEXT: vstr s4, [r0]
+; CHECK-NEXT: add.w r0, r2, r9, lsl #2
+; CHECK-NEXT: vstr s0, [r0]
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: cmp r5, r0
+; CHECK-NEXT: blo .LBB2_2
+; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+entry: ; three outputs per outer iteration, pOut[k], pOut[k + 1] and pOut[k + 2]; k starts at 1 and advances by 3
+ %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
+ %0 = load i32, i32* %NumInputs, align 4
+ %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
+ %1 = load i32, i32* %NumFilters, align 4
+ %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
+ %2 = load float*, float** %pDCTCoefs, align 4
+ %cmp = icmp ugt i32 %0, 1
+ tail call void @llvm.assume(i1 %cmp) ; the IR assumes NumInputs > 1
+ %sub = add i32 %1, -3 ; NumFilters - 3, the bound the outer loop counter is compared against
+ %cmp392 = icmp ugt i32 %sub, 1
+ br i1 %cmp392, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %n.rnd.up = add i32 %0, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; NumInputs + 3 with the low two bits cleared (a multiple of 4)
+ br label %for.body
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %middle.block
+ %k2.093 = phi i32 [ %add34, %middle.block ], [ 1, %for.body.preheader ]
+ %mul4 = mul i32 %k2.093, %0 ; element offset of row k inside the coefficient array
+ %add = add nuw i32 %k2.093, 1
+ %mul5 = mul i32 %add, %0 ; element offset of row k + 1
+ %add6 = add i32 %k2.093, 2
+ %mul7 = mul i32 %add6, %0 ; element offset of row k + 2
+ br label %vector.body ; NOTE(review): the expected assembly above runs this inner loop as dls/le with vctp.32 and vpst blocks, not dlstp/letp
+
+vector.body: ; preds = %vector.body, %for.body
+ %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
+ %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %20, %vector.body ] ; accumulator for row k + 2
+ %vec.phi94 = phi <4 x float> [ zeroinitializer, %for.body ], [ %21, %vector.body ] ; accumulator for row k + 1
+ %vec.phi95 = phi <4 x float> [ zeroinitializer, %for.body ], [ %22, %vector.body ] ; accumulator for row k
+ %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) ; mask built from %index and NumInputs; shared by every load and select below
+ %3 = getelementptr inbounds float, float* %pIn, i32 %index
+ %4 = bitcast float* %3 to <4 x float>*
+ %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %5 = add i32 %index, %mul4
+ %6 = getelementptr inbounds float, float* %2, i32 %5
+ %7 = bitcast float* %6 to <4 x float>*
+ %wide.masked.load96 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %8 = fmul fast <4 x float> %wide.masked.load96, %wide.masked.load
+ %9 = fadd fast <4 x float> %8, %vec.phi95
+ %10 = add i32 %index, %mul5
+ %11 = getelementptr inbounds float, float* %2, i32 %10
+ %12 = bitcast float* %11 to <4 x float>*
+ %wide.masked.load97 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %13 = fmul fast <4 x float> %wide.masked.load97, %wide.masked.load
+ %14 = fadd fast <4 x float> %13, %vec.phi94
+ %15 = add i32 %index, %mul7
+ %16 = getelementptr inbounds float, float* %2, i32 %15
+ %17 = bitcast float* %16 to <4 x float>*
+ %wide.masked.load98 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %18 = fmul fast <4 x float> %wide.masked.load98, %wide.masked.load
+ %19 = fadd fast <4 x float> %18, %vec.phi
+ %20 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi ; lanes outside the mask keep the previous accumulator value
+ %21 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi94
+ %22 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi95
+ %index.next = add i32 %index, 4
+ %23 = icmp eq i32 %index.next, %n.vec
+ br i1 %23, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %24 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %22) ; lane sum for row k
+ %25 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %21) ; lane sum for row k + 1
+ %26 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %20) ; lane sum for row k + 2
+ %arrayidx28 = getelementptr inbounds float, float* %pOut, i32 %k2.093
+ store float %24, float* %arrayidx28, align 4
+ %arrayidx30 = getelementptr inbounds float, float* %pOut, i32 %add
+ store float %25, float* %arrayidx30, align 4
+ %arrayidx32 = getelementptr inbounds float, float* %pOut, i32 %add6
+ store float %26, float* %arrayidx32, align 4
+ %add34 = add i32 %k2.093, 3
+ %cmp3 = icmp ult i32 %add34, %sub
+ br i1 %cmp3, label %for.body, label %for.cond.cleanup
+}
+
+define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { ; Each outer iteration forms four masked dot products of %pIn with consecutive coefficient rows and stores them to %pOut[k2 .. k2+3].
+; CHECK-LABEL: DCT_mve4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [r0, #4]
+; CHECK-NEXT: subs r1, #4
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: cmp r1, #2
+; CHECK-NEXT: blo.w .LBB3_5
+; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: movs r6, #1
+; CHECK-NEXT: ldr r1, [r0]
+; CHECK-NEXT: add.w r0, r3, r3, lsl #1
+; CHECK-NEXT: add.w r12, r1, r3, lsl #2
+; CHECK-NEXT: add.w r10, r1, r3, lsl #3
+; CHECK-NEXT: add.w r9, r1, r3, lsl #4
+; CHECK-NEXT: add.w r8, r1, r0, lsl #2
+; CHECK-NEXT: adds r0, r3, #3
+; CHECK-NEXT: bic r0, r0, #3
+; CHECK-NEXT: lsls r7, r3, #4
+; CHECK-NEXT: subs r0, #4
+; CHECK-NEXT: add.w r0, r6, r0, lsr #2
+; CHECK-NEXT: strd r0, r3, [sp, #4] @ 8-byte Folded Spill
+; CHECK-NEXT: .LBB3_2: @ %for.body
+; CHECK-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
+; CHECK-NEXT: ldr.w lr, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r6, #3
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: mov r3, r12
+; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: adds r0, r6, #2
+; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr.w r11, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: mov r5, r8
+; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: adds r0, r6, #1
+; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: mov r0, r10
+; CHECK-NEXT: mov r4, r9
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmov q2, q0
+; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: .LBB3_3: @ %vector.body
+; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1
+; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT: vctp.32 r11
+; CHECK-NEXT: sub.w r11, r11, #4
+; CHECK-NEXT: vpstttt
+; CHECK-NEXT: vldrwt.u32 q4, [r1], #16
+; CHECK-NEXT: vldrwt.u32 q5, [r0], #16
+; CHECK-NEXT: vfmat.f32 q3, q5, q4
+; CHECK-NEXT: vldrwt.u32 q5, [r3], #16
+; CHECK-NEXT: vpstttt
+; CHECK-NEXT: vfmat.f32 q2, q5, q4
+; CHECK-NEXT: vldrwt.u32 q5, [r5], #16
+; CHECK-NEXT: vfmat.f32 q1, q5, q4
+; CHECK-NEXT: vldrwt.u32 q5, [r4], #16
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vfmat.f32 q0, q5, q4
+; CHECK-NEXT: le lr, .LBB3_3
+; CHECK-NEXT: @ %bb.4: @ %middle.block
+; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
+; CHECK-NEXT: vadd.f32 s16, s14, s15
+; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: vadd.f32 s12, s12, s13
+; CHECK-NEXT: add r12, r7
+; CHECK-NEXT: vadd.f32 s14, s10, s11
+; CHECK-NEXT: add r10, r7
+; CHECK-NEXT: vadd.f32 s8, s8, s9
+; CHECK-NEXT: add.w r0, r2, r0, lsl #2
+; CHECK-NEXT: vadd.f32 s10, s6, s7
+; CHECK-NEXT: add r8, r7
+; CHECK-NEXT: vadd.f32 s4, s4, s5
+; CHECK-NEXT: add r9, r7
+; CHECK-NEXT: vadd.f32 s6, s2, s3
+; CHECK-NEXT: vadd.f32 s0, s0, s1
+; CHECK-NEXT: vadd.f32 s2, s12, s16
+; CHECK-NEXT: vadd.f32 s8, s8, s14
+; CHECK-NEXT: vadd.f32 s4, s4, s10
+; CHECK-NEXT: vadd.f32 s0, s0, s6
+; CHECK-NEXT: vstr s2, [r0]
+; CHECK-NEXT: add.w r0, r2, r6, lsl #2
+; CHECK-NEXT: adds r6, #4
+; CHECK-NEXT: vstr s8, [r0]
+; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: add.w r0, r2, r0, lsl #2
+; CHECK-NEXT: vstr s4, [r0]
+; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: add.w r0, r2, r0, lsl #2
+; CHECK-NEXT: vstr s0, [r0]
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: cmp r6, r0
+; CHECK-NEXT: blo .LBB3_2
+; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup
+; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+entry:
+ %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
+ %0 = load i32, i32* %NumInputs, align 4 ; struct field 2: used below as the row stride and the lane-mask bound
+ %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
+ %1 = load i32, i32* %NumFilters, align 4 ; struct field 1: bounds the outer loop through %sub
+ %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
+ %2 = load float*, float** %pDCTCoefs, align 4 ; struct field 0: base pointer of the coefficient rows
+ %cmp = icmp ugt i32 %0, 1
+ tail call void @llvm.assume(i1 %cmp) ; lets the optimizer rely on %0 > 1
+ %sub = add i32 %1, -4 ; outer-loop limit: %1 - 4
+ %cmp3113 = icmp ugt i32 %sub, 1 ; enter the loops only if the limit exceeds the initial k2 of 1
+ br i1 %cmp3113, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %n.rnd.up = add i32 %0, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; %0 rounded up to a multiple of 4: exit value for %index
+ br label %for.body
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %middle.block
+ %k2.0114 = phi i32 [ %add43, %middle.block ], [ 1, %for.body.preheader ] ; k2: starts at 1, advances by 4 per outer iteration
+ %mul4 = mul i32 %k2.0114, %0 ; element offset of row k2
+ %add = add nuw nsw i32 %k2.0114, 1
+ %mul5 = mul i32 %add, %0 ; element offset of row k2+1
+ %add6 = add nuw nsw i32 %k2.0114, 2
+ %mul7 = mul i32 %add6, %0 ; element offset of row k2+2
+ %add8 = add i32 %k2.0114, 3
+ %mul9 = mul i32 %add8, %0 ; element offset of row k2+3
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %for.body
+ %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] ; element index within a row, step 4
+ %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %25, %vector.body ] ; accumulator for row k2+3
+ %vec.phi115 = phi <4 x float> [ zeroinitializer, %for.body ], [ %26, %vector.body ] ; accumulator for row k2+2
+ %vec.phi116 = phi <4 x float> [ zeroinitializer, %for.body ], [ %27, %vector.body ] ; accumulator for row k2
+ %vec.phi117 = phi <4 x float> [ zeroinitializer, %for.body ], [ %28, %vector.body ] ; accumulator for row k2+1
+ %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) ; lane i is active while %index + i < %0
+ %3 = getelementptr inbounds float, float* %pIn, i32 %index
+ %4 = bitcast float* %3 to <4 x float>*
+ %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) ; four input floats, shared by all rows; passthru is undef
+ %5 = add i32 %index, %mul4
+ %6 = getelementptr inbounds float, float* %2, i32 %5
+ %7 = bitcast float* %6 to <4 x float>*
+ %wide.masked.load118 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %8 = fmul fast <4 x float> %wide.masked.load118, %wide.masked.load
+ %9 = fadd fast <4 x float> %8, %vec.phi116 ; row k2 partial sums
+ %10 = add i32 %index, %mul5
+ %11 = getelementptr inbounds float, float* %2, i32 %10
+ %12 = bitcast float* %11 to <4 x float>*
+ %wide.masked.load119 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %13 = fmul fast <4 x float> %wide.masked.load119, %wide.masked.load
+ %14 = fadd fast <4 x float> %13, %vec.phi117 ; row k2+1 partial sums
+ %15 = add i32 %index, %mul7
+ %16 = getelementptr inbounds float, float* %2, i32 %15
+ %17 = bitcast float* %16 to <4 x float>*
+ %wide.masked.load120 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %18 = fmul fast <4 x float> %wide.masked.load120, %wide.masked.load
+ %19 = fadd fast <4 x float> %18, %vec.phi115 ; row k2+2 partial sums
+ %20 = add i32 %index, %mul9
+ %21 = getelementptr inbounds float, float* %2, i32 %20
+ %22 = bitcast float* %21 to <4 x float>*
+ %wide.masked.load121 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %23 = fmul fast <4 x float> %wide.masked.load121, %wide.masked.load
+ %24 = fadd fast <4 x float> %23, %vec.phi ; row k2+3 partial sums
+ %25 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi ; inactive lanes keep the previous accumulator value (same for %26-%28)
+ %26 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi115
+ %27 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi116
+ %28 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi117
+ %index.next = add i32 %index, 4
+ %29 = icmp eq i32 %index.next, %n.vec
+ br i1 %29, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %30 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %28) ; horizontal sum for row k2+1
+ %31 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %27) ; horizontal sum for row k2
+ %32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %26) ; horizontal sum for row k2+2
+ %33 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %25) ; horizontal sum for row k2+3
+ %arrayidx35 = getelementptr inbounds float, float* %pOut, i32 %k2.0114
+ store float %31, float* %arrayidx35, align 4 ; pOut[k2]
+ %arrayidx37 = getelementptr inbounds float, float* %pOut, i32 %add
+ store float %30, float* %arrayidx37, align 4 ; pOut[k2+1]
+ %arrayidx39 = getelementptr inbounds float, float* %pOut, i32 %add6
+ store float %32, float* %arrayidx39, align 4 ; pOut[k2+2]
+ %arrayidx41 = getelementptr inbounds float, float* %pOut, i32 %add8
+ store float %33, float* %arrayidx41, align 4 ; pOut[k2+3]
+ %add43 = add i32 %k2.0114, 4
+ %cmp3 = icmp ult i32 %add43, %sub ; continue while the next k2 is below %sub (unsigned)
+ br i1 %cmp3, label %for.body, label %for.cond.cleanup
+}
+
+define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { ; Each outer iteration forms five masked dot products of %pIn with consecutive coefficient rows and stores them to %pOut[k2 .. k2+4].
+; CHECK-LABEL: DCT_mve5:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [r0, #4]
+; CHECK-NEXT: subs r1, #5
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: cmp r1, #2
+; CHECK-NEXT: blo.w .LBB4_5
+; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr r1, [r0]
+; CHECK-NEXT: adds r0, r3, #3
+; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: bic r0, r0, #3
+; CHECK-NEXT: add.w r12, r1, r3, lsl #2
+; CHECK-NEXT: subs r1, r0, #4
+; CHECK-NEXT: movs r0, #1
+; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: add.w r1, r0, r1, lsr #2
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: add.w r1, r3, r3, lsl #2
+; CHECK-NEXT: lsls r1, r1, #2
+; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: .LBB4_2: @ %for.body
+; CHECK-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NEXT: @ Child Loop BB4_3 Depth 2
+; CHECK-NEXT: ldr.w lr, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: add.w r10, r0, #2
+; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: ldr r6, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add.w r11, r0, #1
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: mov r3, r12
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov q3, q1
+; CHECK-NEXT: vmov q2, q1
+; CHECK-NEXT: vmov q4, q1
+; CHECK-NEXT: .LBB4_3: @ %vector.body
+; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1
+; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT: add.w r8, r3, r5
+; CHECK-NEXT: vctp.32 r1
+; CHECK-NEXT: vpsttt
+; CHECK-NEXT: vldrwt.u32 q5, [r6], #16
+; CHECK-NEXT: vldrwt.u32 q6, [r3], #16
+; CHECK-NEXT: vfmat.f32 q3, q6, q5
+; CHECK-NEXT: add.w r9, r8, r5
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q6, [r8]
+; CHECK-NEXT: vfmat.f32 q4, q6, q5
+; CHECK-NEXT: subs r1, #4
+; CHECK-NEXT: add.w r4, r9, r5
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q6, [r9]
+; CHECK-NEXT: vfmat.f32 q2, q6, q5
+; CHECK-NEXT: adds r7, r4, r5
+; CHECK-NEXT: vpstttt
+; CHECK-NEXT: vldrwt.u32 q6, [r4]
+; CHECK-NEXT: vfmat.f32 q0, q6, q5
+; CHECK-NEXT: vldrwt.u32 q6, [r7]
+; CHECK-NEXT: vfmat.f32 q1, q6, q5
+; CHECK-NEXT: le lr, .LBB4_3
+; CHECK-NEXT: @ %bb.4: @ %middle.block
+; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT: vadd.f32 s20, s18, s19
+; CHECK-NEXT: add.w r1, r2, r11, lsl #2
+; CHECK-NEXT: vadd.f32 s16, s16, s17
+; CHECK-NEXT: vadd.f32 s18, s14, s15
+; CHECK-NEXT: vadd.f32 s12, s12, s13
+; CHECK-NEXT: vadd.f32 s14, s6, s7
+; CHECK-NEXT: vadd.f32 s4, s4, s5
+; CHECK-NEXT: vadd.f32 s6, s10, s11
+; CHECK-NEXT: vadd.f32 s8, s8, s9
+; CHECK-NEXT: vadd.f32 s10, s2, s3
+; CHECK-NEXT: vadd.f32 s0, s0, s1
+; CHECK-NEXT: vadd.f32 s2, s16, s20
+; CHECK-NEXT: vadd.f32 s12, s12, s18
+; CHECK-NEXT: vadd.f32 s4, s4, s14
+; CHECK-NEXT: vadd.f32 s6, s8, s6
+; CHECK-NEXT: vadd.f32 s0, s0, s10
+; CHECK-NEXT: vstr s2, [r1]
+; CHECK-NEXT: add.w r1, r2, r0, lsl #2
+; CHECK-NEXT: adds r0, #5
+; CHECK-NEXT: vstr s12, [r1]
+; CHECK-NEXT: add.w r1, r2, r10, lsl #2
+; CHECK-NEXT: vstr s6, [r1]
+; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s0, [r1]
+; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s4, [r1]
+; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: add r12, r1
+; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: blo.w .LBB4_2
+; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup
+; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+entry:
+ %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
+ %0 = load i32, i32* %NumInputs, align 4 ; struct field 2: used below as the row stride and the lane-mask bound
+ %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
+ %1 = load i32, i32* %NumFilters, align 4 ; struct field 1: bounds the outer loop through %sub
+ %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
+ %2 = load float*, float** %pDCTCoefs, align 4 ; struct field 0: base pointer of the coefficient rows
+ %cmp = icmp ugt i32 %0, 1
+ tail call void @llvm.assume(i1 %cmp) ; lets the optimizer rely on %0 > 1
+ %sub = add i32 %1, -5 ; outer-loop limit: %1 - 5
+ %cmp3134 = icmp ugt i32 %sub, 1 ; enter the loops only if the limit exceeds the initial k2 of 1
+ br i1 %cmp3134, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %n.rnd.up = add i32 %0, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; %0 rounded up to a multiple of 4: exit value for %index
+ br label %for.body
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %middle.block
+ %k2.0135 = phi i32 [ %add52, %middle.block ], [ 1, %for.body.preheader ] ; k2: starts at 1, advances by 5 per outer iteration
+ %mul4 = mul i32 %k2.0135, %0 ; element offset of row k2
+ %add = add nuw i32 %k2.0135, 1
+ %mul5 = mul i32 %add, %0 ; element offset of row k2+1
+ %add6 = add i32 %k2.0135, 2
+ %mul7 = mul i32 %add6, %0 ; element offset of row k2+2
+ %add8 = add i32 %k2.0135, 3
+ %mul9 = mul i32 %add8, %0 ; element offset of row k2+3
+ %add10 = add i32 %k2.0135, 4
+ %mul11 = mul i32 %add10, %0 ; element offset of row k2+4
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %for.body
+ %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] ; element index within a row, step 4
+ %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %30, %vector.body ] ; accumulator for row k2+4
+ %vec.phi136 = phi <4 x float> [ zeroinitializer, %for.body ], [ %31, %vector.body ] ; accumulator for row k2+3
+ %vec.phi137 = phi <4 x float> [ zeroinitializer, %for.body ], [ %32, %vector.body ] ; accumulator for row k2
+ %vec.phi138 = phi <4 x float> [ zeroinitializer, %for.body ], [ %33, %vector.body ] ; accumulator for row k2+2
+ %vec.phi139 = phi <4 x float> [ zeroinitializer, %for.body ], [ %34, %vector.body ] ; accumulator for row k2+1
+ %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) ; lane i is active while %index + i < %0
+ %3 = getelementptr inbounds float, float* %pIn, i32 %index
+ %4 = bitcast float* %3 to <4 x float>*
+ %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) ; four input floats, shared by all rows; passthru is undef
+ %5 = add i32 %index, %mul4
+ %6 = getelementptr inbounds float, float* %2, i32 %5
+ %7 = bitcast float* %6 to <4 x float>*
+ %wide.masked.load140 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %8 = fmul fast <4 x float> %wide.masked.load140, %wide.masked.load
+ %9 = fadd fast <4 x float> %8, %vec.phi137 ; row k2 partial sums
+ %10 = add i32 %index, %mul5
+ %11 = getelementptr inbounds float, float* %2, i32 %10
+ %12 = bitcast float* %11 to <4 x float>*
+ %wide.masked.load141 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %13 = fmul fast <4 x float> %wide.masked.load141, %wide.masked.load
+ %14 = fadd fast <4 x float> %13, %vec.phi139 ; row k2+1 partial sums
+ %15 = add i32 %index, %mul7
+ %16 = getelementptr inbounds float, float* %2, i32 %15
+ %17 = bitcast float* %16 to <4 x float>*
+ %wide.masked.load142 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %18 = fmul fast <4 x float> %wide.masked.load142, %wide.masked.load
+ %19 = fadd fast <4 x float> %18, %vec.phi138 ; row k2+2 partial sums
+ %20 = add i32 %index, %mul9
+ %21 = getelementptr inbounds float, float* %2, i32 %20
+ %22 = bitcast float* %21 to <4 x float>*
+ %wide.masked.load143 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %23 = fmul fast <4 x float> %wide.masked.load143, %wide.masked.load
+ %24 = fadd fast <4 x float> %23, %vec.phi136 ; row k2+3 partial sums
+ %25 = add i32 %index, %mul11
+ %26 = getelementptr inbounds float, float* %2, i32 %25
+ %27 = bitcast float* %26 to <4 x float>*
+ %wide.masked.load144 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %28 = fmul fast <4 x float> %wide.masked.load144, %wide.masked.load
+ %29 = fadd fast <4 x float> %28, %vec.phi ; row k2+4 partial sums
+ %30 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi ; inactive lanes keep the previous accumulator value (same for %31-%34)
+ %31 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi136
+ %32 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi137
+ %33 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi138
+ %34 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi139
+ %index.next = add i32 %index, 4
+ %35 = icmp eq i32 %index.next, %n.vec
+ br i1 %35, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %36 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %34) ; horizontal sum for row k2+1
+ %37 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %33) ; horizontal sum for row k2+2
+ %38 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %32) ; horizontal sum for row k2
+ %39 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %31) ; horizontal sum for row k2+3
+ %40 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %30) ; horizontal sum for row k2+4
+ %arrayidx42 = getelementptr inbounds float, float* %pOut, i32 %k2.0135
+ store float %38, float* %arrayidx42, align 4 ; pOut[k2]
+ %arrayidx44 = getelementptr inbounds float, float* %pOut, i32 %add
+ store float %36, float* %arrayidx44, align 4 ; pOut[k2+1]
+ %arrayidx46 = getelementptr inbounds float, float* %pOut, i32 %add6
+ store float %37, float* %arrayidx46, align 4 ; pOut[k2+2]
+ %arrayidx48 = getelementptr inbounds float, float* %pOut, i32 %add8
+ store float %39, float* %arrayidx48, align 4 ; pOut[k2+3]
+ %arrayidx50 = getelementptr inbounds float, float* %pOut, i32 %add10
+ store float %40, float* %arrayidx50, align 4 ; pOut[k2+4]
+ %add52 = add i32 %k2.0135, 5
+ %cmp3 = icmp ult i32 %add52, %sub ; continue while the next k2 is below %sub (unsigned)
+ br i1 %cmp3, label %for.body, label %for.cond.cleanup
+}
+
+define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { ; Each outer iteration forms six masked dot products of %pIn with consecutive coefficient rows and stores them to %pOut[k2 .. k2+5].
+; CHECK-LABEL: DCT_mve6:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [r0, #4]
+; CHECK-NEXT: subs r1, #6
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: cmp r1, #2
+; CHECK-NEXT: blo.w .LBB5_5
+; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr r1, [r0]
+; CHECK-NEXT: adds r0, r3, #3
+; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: bic r0, r0, #3
+; CHECK-NEXT: add.w r8, r1, r3, lsl #2
+; CHECK-NEXT: subs r1, r0, #4
+; CHECK-NEXT: movs r0, #1
+; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: add.w r1, r0, r1, lsr #2
+; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: add.w r1, r3, r3, lsl #1
+; CHECK-NEXT: lsls r1, r1, #3
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
+; CHECK-NEXT: .LBB5_2: @ %for.body
+; CHECK-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NEXT: @ Child Loop BB5_3 Depth 2
+; CHECK-NEXT: ldr.w lr, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #5
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: add.w r11, r0, #2
+; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: adds r6, r0, #1
+; CHECK-NEXT: ldr.w r12, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: mov r3, r8
+; CHECK-NEXT: vmov q3, q1
+; CHECK-NEXT: vmov q4, q1
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov q5, q1
+; CHECK-NEXT: vmov q2, q1
+; CHECK-NEXT: .LBB5_3: @ %vector.body
+; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1
+; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT: add.w r9, r3, r5
+; CHECK-NEXT: vctp.32 r12
+; CHECK-NEXT: vpsttt
+; CHECK-NEXT: vldrwt.u32 q6, [r1], #16
+; CHECK-NEXT: vldrwt.u32 q7, [r3], #16
+; CHECK-NEXT: vfmat.f32 q4, q7, q6
+; CHECK-NEXT: add.w r10, r9, r5
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q7, [r9]
+; CHECK-NEXT: vfmat.f32 q5, q7, q6
+; CHECK-NEXT: add.w r4, r10, r5
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q7, [r10]
+; CHECK-NEXT: vfmat.f32 q2, q7, q6
+; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: adds r7, r4, r5
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q7, [r4]
+; CHECK-NEXT: vfmat.f32 q0, q7, q6
+; CHECK-NEXT: adds r4, r7, r5
+; CHECK-NEXT: vpstttt
+; CHECK-NEXT: vldrwt.u32 q7, [r7]
+; CHECK-NEXT: vfmat.f32 q3, q7, q6
+; CHECK-NEXT: vldrwt.u32 q7, [r4]
+; CHECK-NEXT: vfmat.f32 q1, q7, q6
+; CHECK-NEXT: le lr, .LBB5_3
+; CHECK-NEXT: @ %bb.4: @ %middle.block
+; CHECK-NEXT: @ in Loop: Header=BB5_2 Depth=1
+; CHECK-NEXT: vadd.f32 s24, s22, s23
+; CHECK-NEXT: add.w r1, r2, r6, lsl #2
+; CHECK-NEXT: vadd.f32 s20, s20, s21
+; CHECK-NEXT: vadd.f32 s22, s18, s19
+; CHECK-NEXT: vadd.f32 s16, s16, s17
+; CHECK-NEXT: vadd.f32 s18, s6, s7
+; CHECK-NEXT: vadd.f32 s4, s4, s5
+; CHECK-NEXT: vadd.f32 s6, s14, s15
+; CHECK-NEXT: vadd.f32 s12, s12, s13
+; CHECK-NEXT: vadd.f32 s14, s10, s11
+; CHECK-NEXT: vadd.f32 s8, s8, s9
+; CHECK-NEXT: vadd.f32 s0, s0, s1
+; CHECK-NEXT: vadd.f32 s10, s2, s3
+; CHECK-NEXT: vadd.f32 s2, s20, s24
+; CHECK-NEXT: vadd.f32 s1, s16, s22
+; CHECK-NEXT: vadd.f32 s6, s12, s6
+; CHECK-NEXT: vadd.f32 s4, s4, s18
+; CHECK-NEXT: vadd.f32 s8, s8, s14
+; CHECK-NEXT: vadd.f32 s0, s0, s10
+; CHECK-NEXT: vstr s2, [r1]
+; CHECK-NEXT: add.w r1, r2, r0, lsl #2
+; CHECK-NEXT: adds r0, #6
+; CHECK-NEXT: vstr s1, [r1]
+; CHECK-NEXT: add.w r1, r2, r11, lsl #2
+; CHECK-NEXT: vstr s8, [r1]
+; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s0, [r1]
+; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s6, [r1]
+; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s4, [r1]
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
+; CHECK-NEXT: add r8, r1
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: blo.w .LBB5_2
+; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup
+; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+entry:
+ %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
+ %0 = load i32, i32* %NumInputs, align 4 ; struct field 2: used below as the row stride and the lane-mask bound
+ %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
+ %1 = load i32, i32* %NumFilters, align 4 ; struct field 1: bounds the outer loop through %sub
+ %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
+ %2 = load float*, float** %pDCTCoefs, align 4 ; struct field 0: base pointer of the coefficient rows
+ %cmp = icmp ugt i32 %0, 1
+ tail call void @llvm.assume(i1 %cmp) ; lets the optimizer rely on %0 > 1
+ %sub = add i32 %1, -6 ; outer-loop limit: %1 - 6
+ %cmp3155 = icmp ugt i32 %sub, 1 ; enter the loops only if the limit exceeds the initial k2 of 1
+ br i1 %cmp3155, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %n.rnd.up = add i32 %0, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; %0 rounded up to a multiple of 4: exit value for %index
+ br label %for.body
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %middle.block
+ %k2.0156 = phi i32 [ %add61, %middle.block ], [ 1, %for.body.preheader ] ; k2: starts at 1, advances by 6 per outer iteration
+ %mul4 = mul i32 %k2.0156, %0 ; element offset of row k2
+ %add = add nuw i32 %k2.0156, 1
+ %mul5 = mul i32 %add, %0 ; element offset of row k2+1
+ %add6 = add i32 %k2.0156, 2
+ %mul7 = mul i32 %add6, %0 ; element offset of row k2+2
+ %add8 = add i32 %k2.0156, 3
+ %mul9 = mul i32 %add8, %0 ; element offset of row k2+3
+ %add10 = add i32 %k2.0156, 4
+ %mul11 = mul i32 %add10, %0 ; element offset of row k2+4
+ %add12 = add i32 %k2.0156, 5
+ %mul13 = mul i32 %add12, %0 ; element offset of row k2+5
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %for.body
+ %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] ; element index within a row, step 4
+ %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %35, %vector.body ] ; accumulator for row k2+5
+ %vec.phi157 = phi <4 x float> [ zeroinitializer, %for.body ], [ %36, %vector.body ] ; accumulator for row k2+4
+ %vec.phi158 = phi <4 x float> [ zeroinitializer, %for.body ], [ %37, %vector.body ] ; accumulator for row k2
+ %vec.phi159 = phi <4 x float> [ zeroinitializer, %for.body ], [ %38, %vector.body ] ; accumulator for row k2+3
+ %vec.phi160 = phi <4 x float> [ zeroinitializer, %for.body ], [ %39, %vector.body ] ; accumulator for row k2+1
+ %vec.phi161 = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ] ; accumulator for row k2+2
+ %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) ; lane i is active while %index + i < %0
+ %3 = getelementptr inbounds float, float* %pIn, i32 %index
+ %4 = bitcast float* %3 to <4 x float>*
+ %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) ; four input floats, shared by all rows; passthru is undef
+ %5 = add i32 %index, %mul4
+ %6 = getelementptr inbounds float, float* %2, i32 %5
+ %7 = bitcast float* %6 to <4 x float>*
+ %wide.masked.load162 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %8 = fmul fast <4 x float> %wide.masked.load162, %wide.masked.load
+ %9 = fadd fast <4 x float> %8, %vec.phi158 ; row k2 partial sums
+ %10 = add i32 %index, %mul5
+ %11 = getelementptr inbounds float, float* %2, i32 %10
+ %12 = bitcast float* %11 to <4 x float>*
+ %wide.masked.load163 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %13 = fmul fast <4 x float> %wide.masked.load163, %wide.masked.load
+ %14 = fadd fast <4 x float> %13, %vec.phi160 ; row k2+1 partial sums
+ %15 = add i32 %index, %mul7
+ %16 = getelementptr inbounds float, float* %2, i32 %15
+ %17 = bitcast float* %16 to <4 x float>*
+ %wide.masked.load164 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %18 = fmul fast <4 x float> %wide.masked.load164, %wide.masked.load
+ %19 = fadd fast <4 x float> %18, %vec.phi161 ; row k2+2 partial sums
+ %20 = add i32 %index, %mul9
+ %21 = getelementptr inbounds float, float* %2, i32 %20
+ %22 = bitcast float* %21 to <4 x float>*
+ %wide.masked.load165 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %23 = fmul fast <4 x float> %wide.masked.load165, %wide.masked.load
+ %24 = fadd fast <4 x float> %23, %vec.phi159 ; row k2+3 partial sums
+ %25 = add i32 %index, %mul11
+ %26 = getelementptr inbounds float, float* %2, i32 %25
+ %27 = bitcast float* %26 to <4 x float>*
+ %wide.masked.load166 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %28 = fmul fast <4 x float> %wide.masked.load166, %wide.masked.load
+ %29 = fadd fast <4 x float> %28, %vec.phi157 ; row k2+4 partial sums
+ %30 = add i32 %index, %mul13
+ %31 = getelementptr inbounds float, float* %2, i32 %30
+ %32 = bitcast float* %31 to <4 x float>*
+ %wide.masked.load167 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %33 = fmul fast <4 x float> %wide.masked.load167, %wide.masked.load
+ %34 = fadd fast <4 x float> %33, %vec.phi ; row k2+5 partial sums
+ %35 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi ; inactive lanes keep the previous accumulator value (same for %36-%40)
+ %36 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi157
+ %37 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi158
+ %38 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi159
+ %39 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi160
+ %40 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi161
+ %index.next = add i32 %index, 4
+ %41 = icmp eq i32 %index.next, %n.vec
+ br i1 %41, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %42 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40) ; horizontal sum for row k2+2
+ %43 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %39) ; horizontal sum for row k2+1
+ %44 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %38) ; horizontal sum for row k2+3
+ %45 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %37) ; horizontal sum for row k2
+ %46 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %36) ; horizontal sum for row k2+4
+ %47 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %35) ; horizontal sum for row k2+5
+ %arrayidx49 = getelementptr inbounds float, float* %pOut, i32 %k2.0156
+ store float %45, float* %arrayidx49, align 4 ; pOut[k2]
+ %arrayidx51 = getelementptr inbounds float, float* %pOut, i32 %add
+ store float %43, float* %arrayidx51, align 4 ; pOut[k2+1]
+ %arrayidx53 = getelementptr inbounds float, float* %pOut, i32 %add6
+ store float %42, float* %arrayidx53, align 4 ; pOut[k2+2]
+ %arrayidx55 = getelementptr inbounds float, float* %pOut, i32 %add8
+ store float %44, float* %arrayidx55, align 4 ; pOut[k2+3]
+ %arrayidx57 = getelementptr inbounds float, float* %pOut, i32 %add10
+ store float %46, float* %arrayidx57, align 4 ; pOut[k2+4]
+ %arrayidx59 = getelementptr inbounds float, float* %pOut, i32 %add12
+ store float %47, float* %arrayidx59, align 4 ; pOut[k2+5]
+ %add61 = add i32 %k2.0156, 6
+ %cmp3 = icmp ult i32 %add61, %sub ; continue while the next k2 is below %sub (unsigned)
+ br i1 %cmp3, label %for.body, label %for.cond.cleanup
+}
+
+define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
+; CHECK-LABEL: DCT_mve7:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #88
+; CHECK-NEXT: sub sp, #88
+; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [r0, #4]
+; CHECK-NEXT: subs r1, #7
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: cmp r1, #2
+; CHECK-NEXT: blo.w .LBB6_5
+; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr r1, [r0]
+; CHECK-NEXT: adds r0, r3, #3
+; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: bic r0, r0, #3
+; CHECK-NEXT: add.w r12, r1, r3, lsl #2
+; CHECK-NEXT: subs r1, r0, #4
+; CHECK-NEXT: movs r0, #1
+; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: add.w r1, r0, r1, lsr #2
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: rsb r1, r3, r3, lsl #3
+; CHECK-NEXT: lsls r1, r1, #2
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: .LBB6_2: @ %for.body
+; CHECK-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NEXT: @ Child Loop BB6_3 Depth 2
+; CHECK-NEXT: ldr.w lr, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #6
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: adds r6, r0, #2
+; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #5
+; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: add.w r8, r0, #1
+; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: mov r3, r12
+; CHECK-NEXT: vmov q4, q2
+; CHECK-NEXT: vmov q5, q2
+; CHECK-NEXT: vmov q3, q2
+; CHECK-NEXT: vmov q6, q2
+; CHECK-NEXT: vmov q1, q2
+; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: .LBB6_3: @ %vector.body
+; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1
+; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT: add.w r10, r3, r5
+; CHECK-NEXT: vctp.32 r9
+; CHECK-NEXT: vpsttt
+; CHECK-NEXT: vldrwt.u32 q7, [r1], #16
+; CHECK-NEXT: vldrwt.u32 q0, [r3], #16
+; CHECK-NEXT: vfmat.f32 q5, q0, q7
+; CHECK-NEXT: add.w r11, r10, r5
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q0, [r10]
+; CHECK-NEXT: vfmat.f32 q6, q0, q7
+; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q0, [r11]
+; CHECK-NEXT: vfmat.f32 q1, q0, q7
+; CHECK-NEXT: add.w r4, r11, r5
+; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vmov q5, q4
+; CHECK-NEXT: vmov q4, q2
+; CHECK-NEXT: vmov q2, q3
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vldrwt.u32 q0, [r4]
+; CHECK-NEXT: vmov q3, q1
+; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: adds r7, r4, r5
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vfmat.f32 q1, q0, q7
+; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: vmov q1, q3
+; CHECK-NEXT: vmov q3, q2
+; CHECK-NEXT: vmov q2, q4
+; CHECK-NEXT: vmov q4, q5
+; CHECK-NEXT: vmov q5, q6
+; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: sub.w r9, r9, #4
+; CHECK-NEXT: adds r4, r7, r5
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q0, [r7]
+; CHECK-NEXT: vfmat.f32 q3, q0, q7
+; CHECK-NEXT: adds r7, r4, r5
+; CHECK-NEXT: vpstttt
+; CHECK-NEXT: vldrwt.u32 q0, [r4]
+; CHECK-NEXT: vfmat.f32 q4, q0, q7
+; CHECK-NEXT: vldrwt.u32 q0, [r7]
+; CHECK-NEXT: vfmat.f32 q2, q0, q7
+; CHECK-NEXT: le lr, .LBB6_3
+; CHECK-NEXT: @ %bb.4: @ %middle.block
+; CHECK-NEXT: @ in Loop: Header=BB6_2 Depth=1
+; CHECK-NEXT: vadd.f32 s0, s26, s27
+; CHECK-NEXT: add.w r1, r2, r8, lsl #2
+; CHECK-NEXT: vadd.f32 s2, s24, s25
+; CHECK-NEXT: vadd.f32 s3, s20, s21
+; CHECK-NEXT: vadd.f32 s1, s22, s23
+; CHECK-NEXT: vadd.f32 s8, s8, s9
+; CHECK-NEXT: vadd.f32 s20, s10, s11
+; CHECK-NEXT: vadd.f32 s11, s14, s15
+; CHECK-NEXT: vadd.f32 s12, s12, s13
+; CHECK-NEXT: vadd.f32 s14, s6, s7
+; CHECK-NEXT: vadd.f32 s4, s4, s5
+; CHECK-NEXT: vadd.f32 s0, s2, s0
+; CHECK-NEXT: vadd.f32 s10, s18, s19
+; CHECK-NEXT: vadd.f32 s9, s16, s17
+; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vadd.f32 s2, s3, s1
+; CHECK-NEXT: vadd.f32 s6, s18, s19
+; CHECK-NEXT: vadd.f32 s5, s16, s17
+; CHECK-NEXT: vadd.f32 s4, s4, s14
+; CHECK-NEXT: vstr s0, [r1]
+; CHECK-NEXT: add.w r1, r2, r0, lsl #2
+; CHECK-NEXT: vadd.f32 s12, s12, s11
+; CHECK-NEXT: adds r0, #7
+; CHECK-NEXT: vadd.f32 s10, s9, s10
+; CHECK-NEXT: vstr s2, [r1]
+; CHECK-NEXT: add.w r1, r2, r6, lsl #2
+; CHECK-NEXT: vadd.f32 s8, s8, s20
+; CHECK-NEXT: vadd.f32 s6, s5, s6
+; CHECK-NEXT: vstr s4, [r1]
+; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s6, [r1]
+; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s12, [r1]
+; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s10, [r1]
+; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s8, [r1]
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: add r12, r1
+; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: blo.w .LBB6_2
+; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup
+; CHECK-NEXT: add sp, #88
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+entry:
+ %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
+ %0 = load i32, i32* %NumInputs, align 4
+ %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
+ %1 = load i32, i32* %NumFilters, align 4
+ %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
+ %2 = load float*, float** %pDCTCoefs, align 4
+ %cmp = icmp ugt i32 %0, 1
+ tail call void @llvm.assume(i1 %cmp)
+ %sub = add i32 %1, -7
+ %cmp3176 = icmp ugt i32 %sub, 1
+ br i1 %cmp3176, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %n.rnd.up = add i32 %0, 3
+ %n.vec = and i32 %n.rnd.up, -4
+ br label %for.body
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %middle.block
+ %k2.0177 = phi i32 [ %add70, %middle.block ], [ 1, %for.body.preheader ]
+ %mul4 = mul i32 %k2.0177, %0
+ %add = add nuw i32 %k2.0177, 1
+ %mul5 = mul i32 %add, %0
+ %add6 = add i32 %k2.0177, 2
+ %mul7 = mul i32 %add6, %0
+ %add8 = add i32 %k2.0177, 3
+ %mul9 = mul i32 %add8, %0
+ %add10 = add i32 %k2.0177, 4
+ %mul11 = mul i32 %add10, %0
+ %add12 = add i32 %k2.0177, 5
+ %mul13 = mul i32 %add12, %0
+ %add14 = add i32 %k2.0177, 6
+ %mul15 = mul i32 %add14, %0
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %for.body
+ %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
+ %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ]
+ %vec.phi178 = phi <4 x float> [ zeroinitializer, %for.body ], [ %41, %vector.body ]
+ %vec.phi179 = phi <4 x float> [ zeroinitializer, %for.body ], [ %42, %vector.body ]
+ %vec.phi180 = phi <4 x float> [ zeroinitializer, %for.body ], [ %43, %vector.body ]
+ %vec.phi181 = phi <4 x float> [ zeroinitializer, %for.body ], [ %44, %vector.body ]
+ %vec.phi182 = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ]
+ %vec.phi183 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ]
+ %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
+ %3 = getelementptr inbounds float, float* %pIn, i32 %index
+ %4 = bitcast float* %3 to <4 x float>*
+ %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %5 = add i32 %index, %mul4
+ %6 = getelementptr inbounds float, float* %2, i32 %5
+ %7 = bitcast float* %6 to <4 x float>*
+ %wide.masked.load184 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %8 = fmul fast <4 x float> %wide.masked.load184, %wide.masked.load
+ %9 = fadd fast <4 x float> %8, %vec.phi179
+ %10 = add i32 %index, %mul5
+ %11 = getelementptr inbounds float, float* %2, i32 %10
+ %12 = bitcast float* %11 to <4 x float>*
+ %wide.masked.load185 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %13 = fmul fast <4 x float> %wide.masked.load185, %wide.masked.load
+ %14 = fadd fast <4 x float> %13, %vec.phi181
+ %15 = add i32 %index, %mul7
+ %16 = getelementptr inbounds float, float* %2, i32 %15
+ %17 = bitcast float* %16 to <4 x float>*
+ %wide.masked.load186 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %18 = fmul fast <4 x float> %wide.masked.load186, %wide.masked.load
+ %19 = fadd fast <4 x float> %18, %vec.phi183
+ %20 = add i32 %index, %mul9
+ %21 = getelementptr inbounds float, float* %2, i32 %20
+ %22 = bitcast float* %21 to <4 x float>*
+ %wide.masked.load187 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %23 = fmul fast <4 x float> %wide.masked.load187, %wide.masked.load
+ %24 = fadd fast <4 x float> %23, %vec.phi182
+ %25 = add i32 %index, %mul11
+ %26 = getelementptr inbounds float, float* %2, i32 %25
+ %27 = bitcast float* %26 to <4 x float>*
+ %wide.masked.load188 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %28 = fmul fast <4 x float> %wide.masked.load188, %wide.masked.load
+ %29 = fadd fast <4 x float> %28, %vec.phi180
+ %30 = add i32 %index, %mul13
+ %31 = getelementptr inbounds float, float* %2, i32 %30
+ %32 = bitcast float* %31 to <4 x float>*
+ %wide.masked.load189 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %33 = fmul fast <4 x float> %wide.masked.load189, %wide.masked.load
+ %34 = fadd fast <4 x float> %33, %vec.phi178
+ %35 = add i32 %index, %mul15
+ %36 = getelementptr inbounds float, float* %2, i32 %35
+ %37 = bitcast float* %36 to <4 x float>*
+ %wide.masked.load190 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
+ %38 = fmul fast <4 x float> %wide.masked.load190, %wide.masked.load
+ %39 = fadd fast <4 x float> %38, %vec.phi
+ %40 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi
+ %41 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi178
+ %42 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi179
+ %43 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi180
+ %44 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi181
+ %45 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi182
+ %46 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi183
+ %index.next = add i32 %index, 4
+ %47 = icmp eq i32 %index.next, %n.vec
+ br i1 %47, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %48 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46)
+ %49 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45)
+ %50 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %44)
+ %51 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %43)
+ %52 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %42)
+ %53 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %41)
+ %54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40)
+ %arrayidx56 = getelementptr inbounds float, float* %pOut, i32 %k2.0177
+ store float %52, float* %arrayidx56, align 4
+ %arrayidx58 = getelementptr inbounds float, float* %pOut, i32 %add
+ store float %50, float* %arrayidx58, align 4
+ %arrayidx60 = getelementptr inbounds float, float* %pOut, i32 %add6
+ store float %48, float* %arrayidx60, align 4
+ %arrayidx62 = getelementptr inbounds float, float* %pOut, i32 %add8
+ store float %49, float* %arrayidx62, align 4
+ %arrayidx64 = getelementptr inbounds float, float* %pOut, i32 %add10
+ store float %51, float* %arrayidx64, align 4
+ %arrayidx66 = getelementptr inbounds float, float* %pOut, i32 %add12
+ store float %53, float* %arrayidx66, align 4
+ %arrayidx68 = getelementptr inbounds float, float* %pOut, i32 %add14
+ store float %54, float* %arrayidx68, align 4
+ %add70 = add i32 %k2.0177, 7
+ %cmp3 = icmp ult i32 %add70, %sub
+ br i1 %cmp3, label %for.body, label %for.cond.cleanup
+}
+
+define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { ; eight-accumulator variant: each outer iteration writes pOut[k2 .. k2+7], one masked multiply-accumulate reduction per output
+; CHECK-LABEL: DCT_mve8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #104
+; CHECK-NEXT: sub sp, #104
+; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [r0, #4]
+; CHECK-NEXT: subs r1, #8
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: cmp r1, #2
+; CHECK-NEXT: blo.w .LBB7_5
+; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr r1, [r0]
+; CHECK-NEXT: adds r0, r3, #3
+; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: bic r0, r0, #3
+; CHECK-NEXT: add.w r9, r1, r3, lsl #2
+; CHECK-NEXT: subs r1, r0, #4
+; CHECK-NEXT: movs r0, #1
+; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: add.w r1, r0, r1, lsr #2
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: lsls r1, r3, #5
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: .LBB7_2: @ %for.body
+; CHECK-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NEXT: @ Child Loop BB7_3 Depth 2
+; CHECK-NEXT: ldr.w lr, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #7
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: adds r6, r0, #3
+; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #6
+; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #5
+; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: ldr.w r12, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: ldr.w r10, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add.w r8, r0, #2
+; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
+; CHECK-NEXT: adds r1, r0, #1
+; CHECK-NEXT: mov r3, r9
+; CHECK-NEXT: vmov q5, q2
+; CHECK-NEXT: vmov q6, q2
+; CHECK-NEXT: vmov q4, q2
+; CHECK-NEXT: vmov q7, q2
+; CHECK-NEXT: vmov q3, q2
+; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: .LBB7_3: @ %vector.body
+; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1
+; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT: add.w r11, r3, r5
+; CHECK-NEXT: vctp.32 r10
+; CHECK-NEXT: vpsttt
+; CHECK-NEXT: vldrwt.u32 q0, [r12], #16
+; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
+; CHECK-NEXT: vfmat.f32 q6, q1, q0
+; CHECK-NEXT: add.w r4, r11, r5
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q1, [r11]
+; CHECK-NEXT: vfmat.f32 q7, q1, q0
+; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vmov q7, q6
+; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vmov q5, q4
+; CHECK-NEXT: vmov q4, q2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vldrwt.u32 q1, [r4]
+; CHECK-NEXT: vmov q2, q3
+; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: adds r7, r4, r5
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vfmat.f32 q3, q1, q0
+; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vldrwt.u32 q1, [r7]
+; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vfmat.f32 q3, q1, q0
+; CHECK-NEXT: adds r4, r7, r5
+; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: vmov q3, q2
+; CHECK-NEXT: vmov q2, q4
+; CHECK-NEXT: vmov q4, q5
+; CHECK-NEXT: vmov q5, q6
+; CHECK-NEXT: vmov q6, q7
+; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: adds r7, r4, r5
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q1, [r4]
+; CHECK-NEXT: vfmat.f32 q3, q1, q0
+; CHECK-NEXT: sub.w r10, r10, #4
+; CHECK-NEXT: adds r4, r7, r5
+; CHECK-NEXT: vpstttt
+; CHECK-NEXT: vldrwt.u32 q1, [r7]
+; CHECK-NEXT: vfmat.f32 q4, q1, q0
+; CHECK-NEXT: vldrwt.u32 q1, [r4]
+; CHECK-NEXT: vfmat.f32 q5, q1, q0
+; CHECK-NEXT: add r4, r5
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q1, [r4]
+; CHECK-NEXT: vfmat.f32 q2, q1, q0
+; CHECK-NEXT: le lr, .LBB7_3
+; CHECK-NEXT: @ %bb.4: @ %middle.block
+; CHECK-NEXT: @ in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT: vadd.f32 s0, s30, s31
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vadd.f32 s2, s28, s29
+; CHECK-NEXT: vadd.f32 s8, s8, s9
+; CHECK-NEXT: vadd.f32 s5, s10, s11
+; CHECK-NEXT: vadd.f32 s4, s26, s27
+; CHECK-NEXT: vadd.f32 s6, s24, s25
+; CHECK-NEXT: vadd.f32 s10, s18, s19
+; CHECK-NEXT: vadd.f32 s7, s16, s17
+; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vadd.f32 s9, s14, s15
+; CHECK-NEXT: vadd.f32 s12, s12, s13
+; CHECK-NEXT: vadd.f32 s14, s18, s19
+; CHECK-NEXT: vadd.f32 s11, s16, s17
+; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT: vadd.f32 s0, s2, s0
+; CHECK-NEXT: vadd.f32 s13, s18, s19
+; CHECK-NEXT: vadd.f32 s15, s16, s17
+; CHECK-NEXT: vadd.f32 s2, s6, s4
+; CHECK-NEXT: vadd.f32 s6, s8, s5
+; CHECK-NEXT: vadd.f32 s8, s7, s10
+; CHECK-NEXT: vadd.f32 s10, s12, s9
+; CHECK-NEXT: vadd.f32 s12, s11, s14
+; CHECK-NEXT: vstr s0, [r1]
+; CHECK-NEXT: add.w r1, r2, r0, lsl #2
+; CHECK-NEXT: vadd.f32 s1, s22, s23
+; CHECK-NEXT: vadd.f32 s14, s15, s13
+; CHECK-NEXT: adds r0, #8
+; CHECK-NEXT: vstr s2, [r1]
+; CHECK-NEXT: add.w r1, r2, r8, lsl #2
+; CHECK-NEXT: vadd.f32 s3, s20, s21
+; CHECK-NEXT: vstr s12, [r1]
+; CHECK-NEXT: add.w r1, r2, r6, lsl #2
+; CHECK-NEXT: vstr s14, [r1]
+; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: vadd.f32 s4, s3, s1
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s10, [r1]
+; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s8, [r1]
+; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s4, [r1]
+; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vstr s6, [r1]
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: add r9, r1
+; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: blo.w .LBB7_2
+; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup
+; CHECK-NEXT: add sp, #104
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+entry:
+ %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 ; address of struct field 2
+ %0 = load i32, i32* %NumInputs, align 4 ; %0 = NumInputs: element count of the inner (vector) loop
+ %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 ; address of struct field 1
+ %1 = load i32, i32* %NumFilters, align 4 ; %1 = NumFilters: bounds the outer loop through %sub
+ %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 ; address of struct field 0
+ %2 = load float*, float** %pDCTCoefs, align 4 ; %2 = base pointer of the coefficient array
+ %cmp = icmp ugt i32 %0, 1
+ tail call void @llvm.assume(i1 %cmp) ; tells the optimizer that NumInputs is greater than 1 (unsigned)
+ %sub = add i32 %1, -8 ; outer-loop bound: NumFilters - 8
+ %cmp3197 = icmp ugt i32 %sub, 1 ; k2 starts at 1, so the loop is entered only when %sub is above 1 (unsigned)
+ br i1 %cmp3197, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %n.rnd.up = add i32 %0, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; NumInputs rounded up to a multiple of 4; exit value for %index
+ br label %for.body
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %middle.block
+ %k2.0198 = phi i32 [ %add79, %middle.block ], [ 1, %for.body.preheader ] ; k2: first output index of this iteration; starts at 1, steps by 8
+ %mul4 = mul i32 %k2.0198, %0 ; element offset (k2 + 0) * NumInputs into %2
+ %add = add nuw nsw i32 %k2.0198, 1
+ %mul5 = mul i32 %add, %0 ; element offset (k2 + 1) * NumInputs into %2
+ %add6 = add nuw nsw i32 %k2.0198, 2
+ %mul7 = mul i32 %add6, %0 ; element offset (k2 + 2) * NumInputs into %2
+ %add8 = add nuw nsw i32 %k2.0198, 3
+ %mul9 = mul i32 %add8, %0 ; element offset (k2 + 3) * NumInputs into %2
+ %add10 = add nuw nsw i32 %k2.0198, 4
+ %mul11 = mul i32 %add10, %0 ; element offset (k2 + 4) * NumInputs into %2
+ %add12 = add nuw nsw i32 %k2.0198, 5
+ %mul13 = mul i32 %add12, %0 ; element offset (k2 + 5) * NumInputs into %2
+ %add14 = add nuw nsw i32 %k2.0198, 6
+ %mul15 = mul i32 %add14, %0 ; element offset (k2 + 6) * NumInputs into %2
+ %add16 = add i32 %k2.0198, 7
+ %mul17 = mul i32 %add16, %0 ; element offset (k2 + 7) * NumInputs into %2
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %for.body
+ %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] ; element index into pIn; starts at 0, steps by 4
+ %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ] ; accumulator for pOut[k2 + 7]
+ %vec.phi199 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ] ; accumulator for pOut[k2 + 6]
+ %vec.phi200 = phi <4 x float> [ zeroinitializer, %for.body ], [ %47, %vector.body ] ; accumulator for pOut[k2]
+ %vec.phi201 = phi <4 x float> [ zeroinitializer, %for.body ], [ %48, %vector.body ] ; accumulator for pOut[k2 + 5]
+ %vec.phi202 = phi <4 x float> [ zeroinitializer, %for.body ], [ %49, %vector.body ] ; accumulator for pOut[k2 + 1]
+ %vec.phi203 = phi <4 x float> [ zeroinitializer, %for.body ], [ %50, %vector.body ] ; accumulator for pOut[k2 + 4]
+ %vec.phi204 = phi <4 x float> [ zeroinitializer, %for.body ], [ %51, %vector.body ] ; accumulator for pOut[k2 + 2]
+ %vec.phi205 = phi <4 x float> [ zeroinitializer, %for.body ], [ %52, %vector.body ] ; accumulator for pOut[k2 + 3]
+ %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) ; lane i is active when %index + i is below NumInputs (unsigned)
+ %3 = getelementptr inbounds float, float* %pIn, i32 %index
+ %4 = bitcast float* %3 to <4 x float>*
+ %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) ; four input samples starting at pIn[index]; reused by all eight products below
+ %5 = add i32 %index, %mul4
+ %6 = getelementptr inbounds float, float* %2, i32 %5
+ %7 = bitcast float* %6 to <4 x float>*
+ %wide.masked.load206 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) ; coefficients at offset %mul4 + index
+ %8 = fmul fast <4 x float> %wide.masked.load206, %wide.masked.load
+ %9 = fadd fast <4 x float> %8, %vec.phi200 ; running sum for pOut[k2]
+ %10 = add i32 %index, %mul5
+ %11 = getelementptr inbounds float, float* %2, i32 %10
+ %12 = bitcast float* %11 to <4 x float>*
+ %wide.masked.load207 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) ; coefficients at offset %mul5 + index
+ %13 = fmul fast <4 x float> %wide.masked.load207, %wide.masked.load
+ %14 = fadd fast <4 x float> %13, %vec.phi202 ; running sum for pOut[k2 + 1]
+ %15 = add i32 %index, %mul7
+ %16 = getelementptr inbounds float, float* %2, i32 %15
+ %17 = bitcast float* %16 to <4 x float>*
+ %wide.masked.load208 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) ; coefficients at offset %mul7 + index
+ %18 = fmul fast <4 x float> %wide.masked.load208, %wide.masked.load
+ %19 = fadd fast <4 x float> %18, %vec.phi204 ; running sum for pOut[k2 + 2]
+ %20 = add i32 %index, %mul9
+ %21 = getelementptr inbounds float, float* %2, i32 %20
+ %22 = bitcast float* %21 to <4 x float>*
+ %wide.masked.load209 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) ; coefficients at offset %mul9 + index
+ %23 = fmul fast <4 x float> %wide.masked.load209, %wide.masked.load
+ %24 = fadd fast <4 x float> %23, %vec.phi205 ; running sum for pOut[k2 + 3]
+ %25 = add i32 %index, %mul11
+ %26 = getelementptr inbounds float, float* %2, i32 %25
+ %27 = bitcast float* %26 to <4 x float>*
+ %wide.masked.load210 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) ; coefficients at offset %mul11 + index
+ %28 = fmul fast <4 x float> %wide.masked.load210, %wide.masked.load
+ %29 = fadd fast <4 x float> %28, %vec.phi203 ; running sum for pOut[k2 + 4]
+ %30 = add i32 %index, %mul13
+ %31 = getelementptr inbounds float, float* %2, i32 %30
+ %32 = bitcast float* %31 to <4 x float>*
+ %wide.masked.load211 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) ; coefficients at offset %mul13 + index
+ %33 = fmul fast <4 x float> %wide.masked.load211, %wide.masked.load
+ %34 = fadd fast <4 x float> %33, %vec.phi201 ; running sum for pOut[k2 + 5]
+ %35 = add i32 %index, %mul15
+ %36 = getelementptr inbounds float, float* %2, i32 %35
+ %37 = bitcast float* %36 to <4 x float>*
+ %wide.masked.load212 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) ; coefficients at offset %mul15 + index
+ %38 = fmul fast <4 x float> %wide.masked.load212, %wide.masked.load
+ %39 = fadd fast <4 x float> %38, %vec.phi199 ; running sum for pOut[k2 + 6]
+ %40 = add i32 %index, %mul17
+ %41 = getelementptr inbounds float, float* %2, i32 %40
+ %42 = bitcast float* %41 to <4 x float>*
+ %wide.masked.load213 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %42, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) ; coefficients at offset %mul17 + index
+ %43 = fmul fast <4 x float> %wide.masked.load213, %wide.masked.load
+ %44 = fadd fast <4 x float> %43, %vec.phi ; running sum for pOut[k2 + 7]
+ %45 = select <4 x i1> %active.lane.mask, <4 x float> %44, <4 x float> %vec.phi ; selects: inactive lanes keep the previous accumulator value
+ %46 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi199
+ %47 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi200
+ %48 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi201
+ %49 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi202
+ %50 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi203
+ %51 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi204
+ %52 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi205
+ %index.next = add i32 %index, 4
+ %53 = icmp eq i32 %index.next, %n.vec ; leave the vector loop once %index reaches the rounded-up count
+ br i1 %53, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %52) ; lane sum (start value 0.0) for pOut[k2 + 3]
+ %55 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %51) ; lane sum for pOut[k2 + 2]
+ %56 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %50) ; lane sum for pOut[k2 + 4]
+ %57 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %49) ; lane sum for pOut[k2 + 1]
+ %58 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %48) ; lane sum for pOut[k2 + 5]
+ %59 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %47) ; lane sum for pOut[k2]
+ %60 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46) ; lane sum for pOut[k2 + 6]
+ %61 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45) ; lane sum for pOut[k2 + 7]
+ %arrayidx63 = getelementptr inbounds float, float* %pOut, i32 %k2.0198
+ store float %59, float* %arrayidx63, align 4
+ %arrayidx65 = getelementptr inbounds float, float* %pOut, i32 %add
+ store float %57, float* %arrayidx65, align 4
+ %arrayidx67 = getelementptr inbounds float, float* %pOut, i32 %add6
+ store float %55, float* %arrayidx67, align 4
+ %arrayidx69 = getelementptr inbounds float, float* %pOut, i32 %add8
+ store float %54, float* %arrayidx69, align 4
+ %arrayidx71 = getelementptr inbounds float, float* %pOut, i32 %add10
+ store float %56, float* %arrayidx71, align 4
+ %arrayidx73 = getelementptr inbounds float, float* %pOut, i32 %add12
+ store float %58, float* %arrayidx73, align 4
+ %arrayidx75 = getelementptr inbounds float, float* %pOut, i32 %add14
+ store float %60, float* %arrayidx75, align 4
+ %arrayidx77 = getelementptr inbounds float, float* %pOut, i32 %add16
+ store float %61, float* %arrayidx77, align 4
+ %add79 = add i32 %k2.0198, 8 ; advance k2 past the eight outputs just written
+ %cmp3 = icmp ult i32 %add79, %sub ; continue while the next k2 is below NumFilters - 8 (unsigned)
+ br i1 %cmp3, label %for.body, label %for.cond.cleanup
+}
+
+declare void @llvm.assume(i1 noundef) ; used by the DCT tests to assert NumInputs is greater than 1
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) ; called with (vector loop index, NumInputs); the result predicates the masked loads and selects
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) ; operands: pointer, alignment, lane mask, passthru (the tests pass alignment 4 and undef)
+declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>) ; operands: start value, vector to sum (the tests pass 0.0 and call it with the fast flag)
diff --git a/llvm/test/Transforms/HardwareLoops/scalar-while.ll b/llvm/test/Transforms/HardwareLoops/scalar-while.ll
index 0b9847b33c71..acb9efd3b72b 100644
--- a/llvm/test/Transforms/HardwareLoops/scalar-while.ll
+++ b/llvm/test/Transforms/HardwareLoops/scalar-while.ll
@@ -1,33 +1,128 @@
-; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC
-; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-REGDEC
-; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-nested-hardware-loop=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC --check-prefix=CHECK-NESTED
-; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-guard=true -S %s -o - | FileCheck %s --check-prefix=CHECK-GUARD
-; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -force-hardware-loop-guard=true -S %s -o - | FileCheck %s --check-prefix=CHECK-GUARD
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -S %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DEC
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -S %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-PHI
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-nested-hardware-loop=true -S %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NESTED
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-guard=true -S %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GUARD
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -force-hardware-loop-guard=true -S %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-PHIGUARD
-; CHECK-LABEL: while_lt
define void @while_lt(i32 %i, i32 %N, i32* nocapture %A) {
+; CHECK-DEC-LABEL: @while_lt(
+; CHECK-DEC-NEXT: entry:
+; CHECK-DEC-NEXT: [[CMP4:%.*]] = icmp ult i32 [[I:%.*]], [[N:%.*]]
+; CHECK-DEC-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-DEC: while.body.preheader:
+; CHECK-DEC-NEXT: [[TMP0:%.*]] = sub i32 [[N]], [[I]]
+; CHECK-DEC-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-DEC-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-DEC: while.body:
+; CHECK-DEC-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-DEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-DEC-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-DEC-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-DEC-NEXT: [[TMP1:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-DEC-NEXT: br i1 [[TMP1]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-DEC: while.end:
+; CHECK-DEC-NEXT: ret void
+;
+; CHECK-PHI-LABEL: @while_lt(
+; CHECK-PHI-NEXT: entry:
+; CHECK-PHI-NEXT: [[CMP4:%.*]] = icmp ult i32 [[I:%.*]], [[N:%.*]]
+; CHECK-PHI-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-PHI: while.body.preheader:
+; CHECK-PHI-NEXT: [[TMP0:%.*]] = sub i32 [[N]], [[I]]
+; CHECK-PHI-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-PHI-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-PHI: while.body:
+; CHECK-PHI-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHI-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHI-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-PHI-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-PHI-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-PHI-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHI-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI: while.end:
+; CHECK-PHI-NEXT: ret void
+;
+; CHECK-NESTED-LABEL: @while_lt(
+; CHECK-NESTED-NEXT: entry:
+; CHECK-NESTED-NEXT: [[CMP4:%.*]] = icmp ult i32 [[I:%.*]], [[N:%.*]]
+; CHECK-NESTED-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-NESTED: while.body.preheader:
+; CHECK-NESTED-NEXT: [[TMP0:%.*]] = sub i32 [[N]], [[I]]
+; CHECK-NESTED-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-NESTED-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-NESTED: while.body:
+; CHECK-NESTED-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NESTED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-NESTED-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-NESTED-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-NESTED-NEXT: [[TMP1:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-NESTED-NEXT: br i1 [[TMP1]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-NESTED: while.end:
+; CHECK-NESTED-NEXT: ret void
+;
+; CHECK-GUARD-LABEL: @while_lt(
+; CHECK-GUARD-NEXT: entry:
+; CHECK-GUARD-NEXT: [[CMP4:%.*]] = icmp ult i32 [[I:%.*]], [[N:%.*]]
+; CHECK-GUARD-NEXT: [[TMP0:%.*]] = sub i32 [[N]], [[I]]
+; CHECK-GUARD-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-GUARD-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-GUARD: while.body:
+; CHECK-GUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-GUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-GUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-GUARD-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-GUARD-NEXT: [[TMP1:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-GUARD-NEXT: br i1 [[TMP1]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-GUARD: while.end:
+; CHECK-GUARD-NEXT: ret void
+;
+; CHECK-PHIGUARD-LABEL: @while_lt(
+; CHECK-PHIGUARD-NEXT: entry:
+; CHECK-PHIGUARD-NEXT: [[CMP4:%.*]] = icmp ult i32 [[I:%.*]], [[N:%.*]]
+; CHECK-PHIGUARD-NEXT: [[TMP0:%.*]] = sub i32 [[N]], [[I]]
+; CHECK-PHIGUARD-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-PHIGUARD: while.body.preheader:
+; CHECK-PHIGUARD-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-PHIGUARD-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-PHIGUARD: while.body:
+; CHECK-PHIGUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHIGUARD-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHIGUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHIGUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-PHIGUARD-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-PHIGUARD-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-PHIGUARD-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHIGUARD-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHIGUARD: while.end:
+; CHECK-PHIGUARD-NEXT: ret void
+;
+; CHECK-REGDEC-LABEL: @while_lt(
+; CHECK-REGDEC-NEXT: entry:
+; CHECK-REGDEC-NEXT: [[CMP4:%.*]] = icmp ult i32 [[I:%.*]], [[N:%.*]]
+; CHECK-REGDEC-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-REGDEC: while.body.preheader:
+; CHECK-REGDEC-NEXT: [[TMP0:%.*]] = sub i32 [[N]], [[I]]
+; CHECK-REGDEC-NEXT: [[TMP1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-REGDEC-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-REGDEC: while.body:
+; CHECK-REGDEC-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-REGDEC-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ]
+; CHECK-REGDEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-REGDEC-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-REGDEC-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-REGDEC-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1)
+; CHECK-REGDEC-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-REGDEC-NEXT: br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-REGDEC: while.end:
+; CHECK-REGDEC-NEXT: ret void
entry:
%cmp4 = icmp ult i32 %i, %N
br i1 %cmp4, label %while.body, label %while.end
-; CHECK-GUARD-LABEL: while_lt
-; CHECK-GUARD: [[COUNT:%[^ ]+]] = sub i32 %N, %i
-; CHECK-GUARD: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
-; CHECK-GUARD: br label %while.body
-
-; CHECK: while.body.preheader:
-; CHECK: [[COUNT:%[^ ]+]] = sub i32 %N, %i
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
-; CHECK: br label %while.body
-
-; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
-; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
-; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
-; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
-
-; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
-; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
-
while.body:
%i.addr.05 = phi i32 [ %inc, %while.body ], [ %i, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
@@ -40,21 +135,120 @@ while.end:
ret void
}
-; CHECK-LABEL: while_gt
-; CHECK: while.body.preheader:
-; CHECK: [[COUNT:%[^ ]+]] = sub i32 %i, %N
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
-; CHECK: br label %while.body
-
-; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
-; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
-; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
-; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
-
-; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
-; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
-
define void @while_gt(i32 %i, i32 %N, i32* nocapture %A) {
+; CHECK-DEC-LABEL: @while_gt(
+; CHECK-DEC-NEXT: entry:
+; CHECK-DEC-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-DEC-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-DEC: while.body.preheader:
+; CHECK-DEC-NEXT: [[TMP0:%.*]] = sub i32 [[I]], [[N]]
+; CHECK-DEC-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-DEC-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-DEC: while.body:
+; CHECK-DEC-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-DEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-DEC-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-DEC-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-DEC-NEXT: [[TMP1:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-DEC-NEXT: br i1 [[TMP1]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-DEC: while.end:
+; CHECK-DEC-NEXT: ret void
+;
+; CHECK-PHI-LABEL: @while_gt(
+; CHECK-PHI-NEXT: entry:
+; CHECK-PHI-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-PHI-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-PHI: while.body.preheader:
+; CHECK-PHI-NEXT: [[TMP0:%.*]] = sub i32 [[I]], [[N]]
+; CHECK-PHI-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-PHI-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-PHI: while.body:
+; CHECK-PHI-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHI-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHI-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-PHI-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-PHI-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-PHI-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHI-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI: while.end:
+; CHECK-PHI-NEXT: ret void
+;
+; CHECK-NESTED-LABEL: @while_gt(
+; CHECK-NESTED-NEXT: entry:
+; CHECK-NESTED-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-NESTED-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-NESTED: while.body.preheader:
+; CHECK-NESTED-NEXT: [[TMP0:%.*]] = sub i32 [[I]], [[N]]
+; CHECK-NESTED-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-NESTED-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-NESTED: while.body:
+; CHECK-NESTED-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NESTED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-NESTED-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-NESTED-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-NESTED-NEXT: [[TMP1:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-NESTED-NEXT: br i1 [[TMP1]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-NESTED: while.end:
+; CHECK-NESTED-NEXT: ret void
+;
+; CHECK-GUARD-LABEL: @while_gt(
+; CHECK-GUARD-NEXT: entry:
+; CHECK-GUARD-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-GUARD-NEXT: [[TMP0:%.*]] = sub i32 [[I]], [[N]]
+; CHECK-GUARD-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-GUARD-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-GUARD: while.body:
+; CHECK-GUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-GUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-GUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-GUARD-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-GUARD-NEXT: [[TMP1:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-GUARD-NEXT: br i1 [[TMP1]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-GUARD: while.end:
+; CHECK-GUARD-NEXT: ret void
+;
+; CHECK-PHIGUARD-LABEL: @while_gt(
+; CHECK-PHIGUARD-NEXT: entry:
+; CHECK-PHIGUARD-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-PHIGUARD-NEXT: [[TMP0:%.*]] = sub i32 [[I]], [[N]]
+; CHECK-PHIGUARD-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-PHIGUARD: while.body.preheader:
+; CHECK-PHIGUARD-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-PHIGUARD-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-PHIGUARD: while.body:
+; CHECK-PHIGUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHIGUARD-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHIGUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHIGUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-PHIGUARD-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-PHIGUARD-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-PHIGUARD-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHIGUARD-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHIGUARD: while.end:
+; CHECK-PHIGUARD-NEXT: ret void
+;
+; CHECK-REGDEC-LABEL: @while_gt(
+; CHECK-REGDEC-NEXT: entry:
+; CHECK-REGDEC-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-REGDEC-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-REGDEC: while.body.preheader:
+; CHECK-REGDEC-NEXT: [[TMP0:%.*]] = sub i32 [[I]], [[N]]
+; CHECK-REGDEC-NEXT: [[TMP1:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP0]])
+; CHECK-REGDEC-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-REGDEC: while.body:
+; CHECK-REGDEC-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-REGDEC-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ]
+; CHECK-REGDEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-REGDEC-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-REGDEC-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-REGDEC-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1)
+; CHECK-REGDEC-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-REGDEC-NEXT: br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-REGDEC: while.end:
+; CHECK-REGDEC-NEXT: ret void
entry:
%cmp4 = icmp sgt i32 %i, %N
br i1 %cmp4, label %while.body, label %while.end
@@ -71,31 +265,126 @@ while.end:
ret void
}
-; CHECK-GUARD-LABEL: while_gte
-; CHECK-GUARD: entry:
-; CHECK-GUARD: br i1 %cmp4, label %while.end, label %while.body.preheader
-; CHECK-GUARD: while.body.preheader:
-; CHECK-GUARD: [[ADD:%[^ ]+]] = add i32 %i, 1
-; CHECK-GUARD: [[COUNT:%[^ ]+]] = sub i32 [[ADD]], %N
-; CHECK-GUARD: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
-; CHECK-GUARD: br label %while.body
-
-; CHECK-LABEL: while_gte
-; CHECK: while.body.preheader:
-; CHECK: [[ADD:%[^ ]+]] = add i32 %i, 1
-; CHECK: [[COUNT:%[^ ]+]] = sub i32 [[ADD]], %N
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
-; CHECK: br label %while.body
-
-; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
-; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
-; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
-; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
-
-; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
-; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
-
define void @while_gte(i32 %i, i32 %N, i32* nocapture %A) {
+; CHECK-DEC-LABEL: @while_gte(
+; CHECK-DEC-NEXT: entry:
+; CHECK-DEC-NEXT: [[CMP4:%.*]] = icmp slt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-DEC-NEXT: br i1 [[CMP4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-DEC: while.body.preheader:
+; CHECK-DEC-NEXT: [[TMP0:%.*]] = add i32 [[I]], 1
+; CHECK-DEC-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[N]]
+; CHECK-DEC-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP1]])
+; CHECK-DEC-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-DEC: while.body:
+; CHECK-DEC-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-DEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-DEC-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-DEC-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-DEC-NEXT: [[TMP2:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-DEC-NEXT: br i1 [[TMP2]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-DEC: while.end:
+; CHECK-DEC-NEXT: ret void
+;
+; CHECK-PHI-LABEL: @while_gte(
+; CHECK-PHI-NEXT: entry:
+; CHECK-PHI-NEXT: [[CMP4:%.*]] = icmp slt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-PHI-NEXT: br i1 [[CMP4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-PHI: while.body.preheader:
+; CHECK-PHI-NEXT: [[TMP0:%.*]] = add i32 [[I]], 1
+; CHECK-PHI-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[N]]
+; CHECK-PHI-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP1]])
+; CHECK-PHI-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-PHI: while.body:
+; CHECK-PHI-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHI-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHI-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-PHI-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-PHI-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1)
+; CHECK-PHI-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-PHI-NEXT: br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI: while.end:
+; CHECK-PHI-NEXT: ret void
+;
+; CHECK-NESTED-LABEL: @while_gte(
+; CHECK-NESTED-NEXT: entry:
+; CHECK-NESTED-NEXT: [[CMP4:%.*]] = icmp slt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-NESTED-NEXT: br i1 [[CMP4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-NESTED: while.body.preheader:
+; CHECK-NESTED-NEXT: [[TMP0:%.*]] = add i32 [[I]], 1
+; CHECK-NESTED-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[N]]
+; CHECK-NESTED-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP1]])
+; CHECK-NESTED-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-NESTED: while.body:
+; CHECK-NESTED-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NESTED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-NESTED-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-NESTED-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-NESTED-NEXT: [[TMP2:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-NESTED-NEXT: br i1 [[TMP2]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-NESTED: while.end:
+; CHECK-NESTED-NEXT: ret void
+;
+; CHECK-GUARD-LABEL: @while_gte(
+; CHECK-GUARD-NEXT: entry:
+; CHECK-GUARD-NEXT: [[CMP4:%.*]] = icmp slt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-GUARD-NEXT: br i1 [[CMP4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD-NEXT: [[TMP0:%.*]] = add i32 [[I]], 1
+; CHECK-GUARD-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[N]]
+; CHECK-GUARD-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP1]])
+; CHECK-GUARD-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-GUARD: while.body:
+; CHECK-GUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-GUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-GUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-GUARD-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-GUARD-NEXT: [[TMP2:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-GUARD-NEXT: br i1 [[TMP2]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-GUARD: while.end:
+; CHECK-GUARD-NEXT: ret void
+;
+; CHECK-PHIGUARD-LABEL: @while_gte(
+; CHECK-PHIGUARD-NEXT: entry:
+; CHECK-PHIGUARD-NEXT: [[CMP4:%.*]] = icmp slt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-PHIGUARD-NEXT: br i1 [[CMP4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-PHIGUARD: while.body.preheader:
+; CHECK-PHIGUARD-NEXT: [[TMP0:%.*]] = add i32 [[I]], 1
+; CHECK-PHIGUARD-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[N]]
+; CHECK-PHIGUARD-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP1]])
+; CHECK-PHIGUARD-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-PHIGUARD: while.body:
+; CHECK-PHIGUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHIGUARD-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP3:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHIGUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHIGUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-PHIGUARD-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-PHIGUARD-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP2]], i32 1)
+; CHECK-PHIGUARD-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-PHIGUARD-NEXT: br i1 [[TMP4]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHIGUARD: while.end:
+; CHECK-PHIGUARD-NEXT: ret void
+;
+; CHECK-REGDEC-LABEL: @while_gte(
+; CHECK-REGDEC-NEXT: entry:
+; CHECK-REGDEC-NEXT: [[CMP4:%.*]] = icmp slt i32 [[I:%.*]], [[N:%.*]]
+; CHECK-REGDEC-NEXT: br i1 [[CMP4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-REGDEC: while.body.preheader:
+; CHECK-REGDEC-NEXT: [[TMP0:%.*]] = add i32 [[I]], 1
+; CHECK-REGDEC-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[N]]
+; CHECK-REGDEC-NEXT: [[TMP2:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[TMP1]])
+; CHECK-REGDEC-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-REGDEC: while.body:
+; CHECK-REGDEC-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[I]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-REGDEC-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP2]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP4:%.*]], [[WHILE_BODY]] ]
+; CHECK-REGDEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-REGDEC-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-REGDEC-NEXT: [[DEC]] = add nsw i32 [[I_ADDR_05]], -1
+; CHECK-REGDEC-NEXT: [[TMP4]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP3]], i32 1)
+; CHECK-REGDEC-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP4]], 0
+; CHECK-REGDEC-NEXT: br i1 [[TMP5]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-REGDEC: while.end:
+; CHECK-REGDEC-NEXT: ret void
entry:
%cmp4 = icmp slt i32 %i, %N
br i1 %cmp4, label %while.end, label %while.body
@@ -112,13 +401,114 @@ while.end:
ret void
}
-; CHECK-GUARD-LABEL: while_ne
-; CHECK-GUARD: entry:
-; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
-; CHECK-GUARD: br i1 [[TEST]], label %while.body.preheader, label %while.end
-; CHECK-GUARD: while.body.preheader:
-; CHECK-GUARD: br label %while.body
define void @while_ne(i32 %N, i32* nocapture %A) {
+; CHECK-DEC-LABEL: @while_ne(
+; CHECK-DEC-NEXT: entry:
+; CHECK-DEC-NEXT: [[CMP:%.*]] = icmp ne i32 [[N:%.*]], 0
+; CHECK-DEC-NEXT: br i1 [[CMP]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-DEC: while.body.preheader:
+; CHECK-DEC-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-DEC-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-DEC: while.body:
+; CHECK-DEC-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-DEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-DEC-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-DEC-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-DEC-NEXT: [[TMP0:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-DEC-NEXT: br i1 [[TMP0]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-DEC: while.end:
+; CHECK-DEC-NEXT: ret void
+;
+; CHECK-PHI-LABEL: @while_ne(
+; CHECK-PHI-NEXT: entry:
+; CHECK-PHI-NEXT: [[CMP:%.*]] = icmp ne i32 [[N:%.*]], 0
+; CHECK-PHI-NEXT: br i1 [[CMP]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-PHI: while.body.preheader:
+; CHECK-PHI-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-PHI-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-PHI: while.body:
+; CHECK-PHI-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHI-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP1:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHI-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-PHI-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-PHI-NEXT: [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
+; CHECK-PHI-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-PHI-NEXT: br i1 [[TMP2]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI: while.end:
+; CHECK-PHI-NEXT: ret void
+;
+; CHECK-NESTED-LABEL: @while_ne(
+; CHECK-NESTED-NEXT: entry:
+; CHECK-NESTED-NEXT: [[CMP:%.*]] = icmp ne i32 [[N:%.*]], 0
+; CHECK-NESTED-NEXT: br i1 [[CMP]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-NESTED: while.body.preheader:
+; CHECK-NESTED-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-NESTED-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-NESTED: while.body:
+; CHECK-NESTED-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NESTED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-NESTED-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-NESTED-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-NESTED-NEXT: [[TMP0:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-NESTED-NEXT: br i1 [[TMP0]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-NESTED: while.end:
+; CHECK-NESTED-NEXT: ret void
+;
+; CHECK-GUARD-LABEL: @while_ne(
+; CHECK-GUARD-NEXT: entry:
+; CHECK-GUARD-NEXT: [[CMP:%.*]] = icmp ne i32 [[N:%.*]], 0
+; CHECK-GUARD-NEXT: [[TMP0:%.*]] = call i1 @llvm.test.set.loop.iterations.i32(i32 [[N]])
+; CHECK-GUARD-NEXT: br i1 [[TMP0]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-GUARD: while.body:
+; CHECK-GUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-GUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-GUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-GUARD-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-GUARD-NEXT: [[TMP1:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-GUARD-NEXT: br i1 [[TMP1]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-GUARD: while.end:
+; CHECK-GUARD-NEXT: ret void
+;
+; CHECK-PHIGUARD-LABEL: @while_ne(
+; CHECK-PHIGUARD-NEXT: entry:
+; CHECK-PHIGUARD-NEXT: [[CMP:%.*]] = icmp ne i32 [[N:%.*]], 0
+; CHECK-PHIGUARD-NEXT: [[TMP0:%.*]] = call i1 @llvm.test.set.loop.iterations.i32(i32 [[N]])
+; CHECK-PHIGUARD-NEXT: br i1 [[TMP0]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-PHIGUARD: while.body.preheader:
+; CHECK-PHIGUARD-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-PHIGUARD: while.body:
+; CHECK-PHIGUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHIGUARD-NEXT: [[TMP1:%.*]] = phi i32 [ [[N]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHIGUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHIGUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-PHIGUARD-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-PHIGUARD-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-PHIGUARD-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHIGUARD-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHIGUARD: while.end:
+; CHECK-PHIGUARD-NEXT: ret void
+;
+; CHECK-REGDEC-LABEL: @while_ne(
+; CHECK-REGDEC-NEXT: entry:
+; CHECK-REGDEC-NEXT: [[CMP:%.*]] = icmp ne i32 [[N:%.*]], 0
+; CHECK-REGDEC-NEXT: br i1 [[CMP]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-REGDEC: while.body.preheader:
+; CHECK-REGDEC-NEXT: [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]])
+; CHECK-REGDEC-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-REGDEC: while.body:
+; CHECK-REGDEC-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-REGDEC-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-REGDEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-REGDEC-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-REGDEC-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-REGDEC-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-REGDEC-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-REGDEC-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-REGDEC: while.end:
+; CHECK-REGDEC-NEXT: ret void
entry:
%cmp = icmp ne i32 %N, 0
br i1 %cmp, label %while.body, label %while.end
@@ -135,13 +525,114 @@ while.end:
ret void
}
-; CHECK-GUARD-LABEL: while_eq
-; CHECK-GUARD: entry:
-; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
-; CHECK-GUARD: br i1 [[TEST]], label %while.body.preheader, label %while.end
-; CHECK-GUARD: while.body.preheader:
-; CHECK-GUARD: br label %while.body
define void @while_eq(i32 %N, i32* nocapture %A) {
+; CHECK-DEC-LABEL: @while_eq(
+; CHECK-DEC-NEXT: entry:
+; CHECK-DEC-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-DEC-NEXT: br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-DEC: while.body.preheader:
+; CHECK-DEC-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-DEC-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-DEC: while.body:
+; CHECK-DEC-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-DEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-DEC-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-DEC-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-DEC-NEXT: [[TMP0:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-DEC-NEXT: br i1 [[TMP0]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-DEC: while.end:
+; CHECK-DEC-NEXT: ret void
+;
+; CHECK-PHI-LABEL: @while_eq(
+; CHECK-PHI-NEXT: entry:
+; CHECK-PHI-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-PHI-NEXT: br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-PHI: while.body.preheader:
+; CHECK-PHI-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-PHI-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-PHI: while.body:
+; CHECK-PHI-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHI-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP1:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHI-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-PHI-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-PHI-NEXT: [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
+; CHECK-PHI-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-PHI-NEXT: br i1 [[TMP2]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI: while.end:
+; CHECK-PHI-NEXT: ret void
+;
+; CHECK-NESTED-LABEL: @while_eq(
+; CHECK-NESTED-NEXT: entry:
+; CHECK-NESTED-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-NESTED-NEXT: br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-NESTED: while.body.preheader:
+; CHECK-NESTED-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-NESTED-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-NESTED: while.body:
+; CHECK-NESTED-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NESTED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-NESTED-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-NESTED-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-NESTED-NEXT: [[TMP0:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-NESTED-NEXT: br i1 [[TMP0]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-NESTED: while.end:
+; CHECK-NESTED-NEXT: ret void
+;
+; CHECK-GUARD-LABEL: @while_eq(
+; CHECK-GUARD-NEXT: entry:
+; CHECK-GUARD-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-GUARD-NEXT: [[TMP0:%.*]] = call i1 @llvm.test.set.loop.iterations.i32(i32 [[N]])
+; CHECK-GUARD-NEXT: br i1 [[TMP0]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-GUARD: while.body:
+; CHECK-GUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-GUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-GUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-GUARD-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-GUARD-NEXT: [[TMP1:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-GUARD-NEXT: br i1 [[TMP1]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-GUARD: while.end:
+; CHECK-GUARD-NEXT: ret void
+;
+; CHECK-PHIGUARD-LABEL: @while_eq(
+; CHECK-PHIGUARD-NEXT: entry:
+; CHECK-PHIGUARD-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-PHIGUARD-NEXT: [[TMP0:%.*]] = call i1 @llvm.test.set.loop.iterations.i32(i32 [[N]])
+; CHECK-PHIGUARD-NEXT: br i1 [[TMP0]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-PHIGUARD: while.body.preheader:
+; CHECK-PHIGUARD-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-PHIGUARD: while.body:
+; CHECK-PHIGUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHIGUARD-NEXT: [[TMP1:%.*]] = phi i32 [ [[N]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHIGUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHIGUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-PHIGUARD-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-PHIGUARD-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-PHIGUARD-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHIGUARD-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHIGUARD: while.end:
+; CHECK-PHIGUARD-NEXT: ret void
+;
+; CHECK-REGDEC-LABEL: @while_eq(
+; CHECK-REGDEC-NEXT: entry:
+; CHECK-REGDEC-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-REGDEC-NEXT: br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-REGDEC: while.body.preheader:
+; CHECK-REGDEC-NEXT: [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]])
+; CHECK-REGDEC-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-REGDEC: while.body:
+; CHECK-REGDEC-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-REGDEC-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-REGDEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-REGDEC-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-REGDEC-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-REGDEC-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-REGDEC-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-REGDEC-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-REGDEC: while.end:
+; CHECK-REGDEC-NEXT: ret void
entry:
%cmp = icmp eq i32 %N, 0
br i1 %cmp, label %while.end, label %while.body
@@ -158,15 +649,126 @@ while.end:
ret void
}
-; CHECK-GUARD-LABEL: while_preheader_eq
-; CHECK-GUARD: entry:
-; CHECK-GUARD: br label %preheader
-; CHECK-GUARD: preheader:
-; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
-; CHECK-GUARD: br i1 [[TEST]], label %while.body.preheader, label %while.end
-; CHECK-GUARD: while.body.preheader:
-; CHECK-GUARD: br label %while.body
define void @while_preheader_eq(i32 %N, i32* nocapture %A) {
+; CHECK-DEC-LABEL: @while_preheader_eq(
+; CHECK-DEC-NEXT: entry:
+; CHECK-DEC-NEXT: br label [[PREHEADER:%.*]]
+; CHECK-DEC: preheader:
+; CHECK-DEC-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-DEC-NEXT: br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-DEC: while.body.preheader:
+; CHECK-DEC-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-DEC-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-DEC: while.body:
+; CHECK-DEC-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-DEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-DEC-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-DEC-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-DEC-NEXT: [[TMP0:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-DEC-NEXT: br i1 [[TMP0]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-DEC: while.end:
+; CHECK-DEC-NEXT: ret void
+;
+; CHECK-PHI-LABEL: @while_preheader_eq(
+; CHECK-PHI-NEXT: entry:
+; CHECK-PHI-NEXT: br label [[PREHEADER:%.*]]
+; CHECK-PHI: preheader:
+; CHECK-PHI-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-PHI-NEXT: br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-PHI: while.body.preheader:
+; CHECK-PHI-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-PHI-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-PHI: while.body:
+; CHECK-PHI-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHI-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP1:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHI-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-PHI-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-PHI-NEXT: [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
+; CHECK-PHI-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-PHI-NEXT: br i1 [[TMP2]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHI: while.end:
+; CHECK-PHI-NEXT: ret void
+;
+; CHECK-NESTED-LABEL: @while_preheader_eq(
+; CHECK-NESTED-NEXT: entry:
+; CHECK-NESTED-NEXT: br label [[PREHEADER:%.*]]
+; CHECK-NESTED: preheader:
+; CHECK-NESTED-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-NESTED-NEXT: br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-NESTED: while.body.preheader:
+; CHECK-NESTED-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-NESTED-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-NESTED: while.body:
+; CHECK-NESTED-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NESTED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-NESTED-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-NESTED-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-NESTED-NEXT: [[TMP0:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-NESTED-NEXT: br i1 [[TMP0]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-NESTED: while.end:
+; CHECK-NESTED-NEXT: ret void
+;
+; CHECK-GUARD-LABEL: @while_preheader_eq(
+; CHECK-GUARD-NEXT: entry:
+; CHECK-GUARD-NEXT: br label [[PREHEADER:%.*]]
+; CHECK-GUARD: preheader:
+; CHECK-GUARD-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-GUARD-NEXT: [[TMP0:%.*]] = call i1 @llvm.test.set.loop.iterations.i32(i32 [[N]])
+; CHECK-GUARD-NEXT: br i1 [[TMP0]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-GUARD: while.body:
+; CHECK-GUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-GUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-GUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-GUARD-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-GUARD-NEXT: [[TMP1:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-GUARD-NEXT: br i1 [[TMP1]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-GUARD: while.end:
+; CHECK-GUARD-NEXT: ret void
+;
+; CHECK-PHIGUARD-LABEL: @while_preheader_eq(
+; CHECK-PHIGUARD-NEXT: entry:
+; CHECK-PHIGUARD-NEXT: br label [[PREHEADER:%.*]]
+; CHECK-PHIGUARD: preheader:
+; CHECK-PHIGUARD-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-PHIGUARD-NEXT: [[TMP0:%.*]] = call i1 @llvm.test.set.loop.iterations.i32(i32 [[N]])
+; CHECK-PHIGUARD-NEXT: br i1 [[TMP0]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
+; CHECK-PHIGUARD: while.body.preheader:
+; CHECK-PHIGUARD-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-PHIGUARD: while.body:
+; CHECK-PHIGUARD-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-PHIGUARD-NEXT: [[TMP1:%.*]] = phi i32 [ [[N]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-PHIGUARD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-PHIGUARD-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-PHIGUARD-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-PHIGUARD-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-PHIGUARD-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-PHIGUARD-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-PHIGUARD: while.end:
+; CHECK-PHIGUARD-NEXT: ret void
+;
+; CHECK-REGDEC-LABEL: @while_preheader_eq(
+; CHECK-REGDEC-NEXT: entry:
+; CHECK-REGDEC-NEXT: br label [[PREHEADER:%.*]]
+; CHECK-REGDEC: preheader:
+; CHECK-REGDEC-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-REGDEC-NEXT: br i1 [[CMP]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK-REGDEC: while.body.preheader:
+; CHECK-REGDEC-NEXT: [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]])
+; CHECK-REGDEC-NEXT: br label [[WHILE_BODY:%.*]]
+; CHECK-REGDEC: while.body:
+; CHECK-REGDEC-NEXT: [[I_ADDR_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-REGDEC-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP2:%.*]], [[WHILE_BODY]] ]
+; CHECK-REGDEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_ADDR_05]]
+; CHECK-REGDEC-NEXT: store i32 [[I_ADDR_05]], i32* [[ARRAYIDX]], align 4
+; CHECK-REGDEC-NEXT: [[INC]] = add nuw i32 [[I_ADDR_05]], 1
+; CHECK-REGDEC-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-REGDEC-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-REGDEC-NEXT: br i1 [[TMP3]], label [[WHILE_BODY]], label [[WHILE_END]]
+; CHECK-REGDEC: while.end:
+; CHECK-REGDEC-NEXT: ret void
entry:
br label %preheader
@@ -186,28 +788,159 @@ while.end:
ret void
}
-; CHECK-LABEL: nested
-; CHECK-NESTED: call void @llvm.set.loop.iterations.i32(i32 %N)
-; CHECK-NESTED: br label %while.cond1.preheader.us
-
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
-; CHECK: br label %while.body3.us
-
-; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
-
-; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ %N, %while.cond1.preheader.us ], [ [[LOOP_DEC:%[^ ]+]], %while.body3.us ]
-; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[REM]], i32 1)
-; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
-; CHECK-REGDEC: br i1 [[CMP]], label %while.body3.us, label %while.cond1.while.end_crit_edge.us
-
-; CHECK-NESTED: [[LOOP_DEC1:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
-; CHECK-NESTED: br i1 [[LOOP_DEC1]], label %while.cond1.preheader.us, label %while.end7
-
-; CHECK-GUARD: while.cond1.preheader.us:
-; CHECK-GUARD: call void @llvm.set.loop.iterations.i32(i32 %N)
-; CHECK-GUARD: br label %while.body3.us
-
define void @nested(i32* nocapture %A, i32 %N) {
+; CHECK-DEC-LABEL: @nested(
+; CHECK-DEC-NEXT: entry:
+; CHECK-DEC-NEXT: [[CMP20:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-DEC-NEXT: br i1 [[CMP20]], label [[WHILE_END7:%.*]], label [[WHILE_COND1_PREHEADER_US:%.*]]
+; CHECK-DEC: while.cond1.preheader.us:
+; CHECK-DEC-NEXT: [[I_021_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[WHILE_COND1_WHILE_END_CRIT_EDGE_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-DEC-NEXT: [[MUL_US:%.*]] = mul i32 [[I_021_US]], [[N]]
+; CHECK-DEC-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-DEC-NEXT: br label [[WHILE_BODY3_US:%.*]]
+; CHECK-DEC: while.body3.us:
+; CHECK-DEC-NEXT: [[J_019_US:%.*]] = phi i32 [ 0, [[WHILE_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[WHILE_BODY3_US]] ]
+; CHECK-DEC-NEXT: [[ADD_US:%.*]] = add i32 [[J_019_US]], [[MUL_US]]
+; CHECK-DEC-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[ADD_US]]
+; CHECK-DEC-NEXT: store i32 [[ADD_US]], i32* [[ARRAYIDX_US]], align 4
+; CHECK-DEC-NEXT: [[INC_US]] = add nuw i32 [[J_019_US]], 1
+; CHECK-DEC-NEXT: [[TMP0:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-DEC-NEXT: br i1 [[TMP0]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]]
+; CHECK-DEC: while.cond1.while.end_crit_edge.us:
+; CHECK-DEC-NEXT: [[INC6_US]] = add nuw i32 [[I_021_US]], 1
+; CHECK-DEC-NEXT: [[EXITCOND23:%.*]] = icmp eq i32 [[INC6_US]], [[N]]
+; CHECK-DEC-NEXT: br i1 [[EXITCOND23]], label [[WHILE_END7]], label [[WHILE_COND1_PREHEADER_US]]
+; CHECK-DEC: while.end7:
+; CHECK-DEC-NEXT: ret void
+;
+; CHECK-PHI-LABEL: @nested(
+; CHECK-PHI-NEXT: entry:
+; CHECK-PHI-NEXT: [[CMP20:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-PHI-NEXT: br i1 [[CMP20]], label [[WHILE_END7:%.*]], label [[WHILE_COND1_PREHEADER_US:%.*]]
+; CHECK-PHI: while.cond1.preheader.us:
+; CHECK-PHI-NEXT: [[I_021_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[WHILE_COND1_WHILE_END_CRIT_EDGE_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-PHI-NEXT: [[MUL_US:%.*]] = mul i32 [[I_021_US]], [[N]]
+; CHECK-PHI-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-PHI-NEXT: br label [[WHILE_BODY3_US:%.*]]
+; CHECK-PHI: while.body3.us:
+; CHECK-PHI-NEXT: [[J_019_US:%.*]] = phi i32 [ 0, [[WHILE_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[WHILE_BODY3_US]] ]
+; CHECK-PHI-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_COND1_PREHEADER_US]] ], [ [[TMP1:%.*]], [[WHILE_BODY3_US]] ]
+; CHECK-PHI-NEXT: [[ADD_US:%.*]] = add i32 [[J_019_US]], [[MUL_US]]
+; CHECK-PHI-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[ADD_US]]
+; CHECK-PHI-NEXT: store i32 [[ADD_US]], i32* [[ARRAYIDX_US]], align 4
+; CHECK-PHI-NEXT: [[INC_US]] = add nuw i32 [[J_019_US]], 1
+; CHECK-PHI-NEXT: [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
+; CHECK-PHI-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-PHI-NEXT: br i1 [[TMP2]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]]
+; CHECK-PHI: while.cond1.while.end_crit_edge.us:
+; CHECK-PHI-NEXT: [[INC6_US]] = add nuw i32 [[I_021_US]], 1
+; CHECK-PHI-NEXT: [[EXITCOND23:%.*]] = icmp eq i32 [[INC6_US]], [[N]]
+; CHECK-PHI-NEXT: br i1 [[EXITCOND23]], label [[WHILE_END7]], label [[WHILE_COND1_PREHEADER_US]]
+; CHECK-PHI: while.end7:
+; CHECK-PHI-NEXT: ret void
+;
+; CHECK-NESTED-LABEL: @nested(
+; CHECK-NESTED-NEXT: entry:
+; CHECK-NESTED-NEXT: [[CMP20:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-NESTED-NEXT: br i1 [[CMP20]], label [[WHILE_END7:%.*]], label [[WHILE_COND1_PREHEADER_US_PREHEADER:%.*]]
+; CHECK-NESTED: while.cond1.preheader.us.preheader:
+; CHECK-NESTED-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-NESTED-NEXT: br label [[WHILE_COND1_PREHEADER_US:%.*]]
+; CHECK-NESTED: while.cond1.preheader.us:
+; CHECK-NESTED-NEXT: [[I_021_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[WHILE_COND1_WHILE_END_CRIT_EDGE_US:%.*]] ], [ 0, [[WHILE_COND1_PREHEADER_US_PREHEADER]] ]
+; CHECK-NESTED-NEXT: [[MUL_US:%.*]] = mul i32 [[I_021_US]], [[N]]
+; CHECK-NESTED-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-NESTED-NEXT: br label [[WHILE_BODY3_US:%.*]]
+; CHECK-NESTED: while.body3.us:
+; CHECK-NESTED-NEXT: [[J_019_US:%.*]] = phi i32 [ 0, [[WHILE_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[WHILE_BODY3_US]] ]
+; CHECK-NESTED-NEXT: [[ADD_US:%.*]] = add i32 [[J_019_US]], [[MUL_US]]
+; CHECK-NESTED-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[ADD_US]]
+; CHECK-NESTED-NEXT: store i32 [[ADD_US]], i32* [[ARRAYIDX_US]], align 4
+; CHECK-NESTED-NEXT: [[INC_US]] = add nuw i32 [[J_019_US]], 1
+; CHECK-NESTED-NEXT: [[TMP0:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-NESTED-NEXT: br i1 [[TMP0]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]]
+; CHECK-NESTED: while.cond1.while.end_crit_edge.us:
+; CHECK-NESTED-NEXT: [[INC6_US]] = add nuw i32 [[I_021_US]], 1
+; CHECK-NESTED-NEXT: [[TMP1:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-NESTED-NEXT: br i1 [[TMP1]], label [[WHILE_COND1_PREHEADER_US]], label [[WHILE_END7]]
+; CHECK-NESTED: while.end7:
+; CHECK-NESTED-NEXT: ret void
+;
+; CHECK-GUARD-LABEL: @nested(
+; CHECK-GUARD-NEXT: entry:
+; CHECK-GUARD-NEXT: [[CMP20:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-GUARD-NEXT: br i1 [[CMP20]], label [[WHILE_END7:%.*]], label [[WHILE_COND1_PREHEADER_US:%.*]]
+; CHECK-GUARD: while.cond1.preheader.us:
+; CHECK-GUARD-NEXT: [[I_021_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[WHILE_COND1_WHILE_END_CRIT_EDGE_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-GUARD-NEXT: [[MUL_US:%.*]] = mul i32 [[I_021_US]], [[N]]
+; CHECK-GUARD-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-GUARD-NEXT: br label [[WHILE_BODY3_US:%.*]]
+; CHECK-GUARD: while.body3.us:
+; CHECK-GUARD-NEXT: [[J_019_US:%.*]] = phi i32 [ 0, [[WHILE_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[WHILE_BODY3_US]] ]
+; CHECK-GUARD-NEXT: [[ADD_US:%.*]] = add i32 [[J_019_US]], [[MUL_US]]
+; CHECK-GUARD-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[ADD_US]]
+; CHECK-GUARD-NEXT: store i32 [[ADD_US]], i32* [[ARRAYIDX_US]], align 4
+; CHECK-GUARD-NEXT: [[INC_US]] = add nuw i32 [[J_019_US]], 1
+; CHECK-GUARD-NEXT: [[TMP0:%.*]] = call i1 @llvm.loop.decrement.i32(i32 1)
+; CHECK-GUARD-NEXT: br i1 [[TMP0]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]]
+; CHECK-GUARD: while.cond1.while.end_crit_edge.us:
+; CHECK-GUARD-NEXT: [[INC6_US]] = add nuw i32 [[I_021_US]], 1
+; CHECK-GUARD-NEXT: [[EXITCOND23:%.*]] = icmp eq i32 [[INC6_US]], [[N]]
+; CHECK-GUARD-NEXT: br i1 [[EXITCOND23]], label [[WHILE_END7]], label [[WHILE_COND1_PREHEADER_US]]
+; CHECK-GUARD: while.end7:
+; CHECK-GUARD-NEXT: ret void
+;
+; CHECK-PHIGUARD-LABEL: @nested(
+; CHECK-PHIGUARD-NEXT: entry:
+; CHECK-PHIGUARD-NEXT: [[CMP20:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-PHIGUARD-NEXT: br i1 [[CMP20]], label [[WHILE_END7:%.*]], label [[WHILE_COND1_PREHEADER_US:%.*]]
+; CHECK-PHIGUARD: while.cond1.preheader.us:
+; CHECK-PHIGUARD-NEXT: [[I_021_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[WHILE_COND1_WHILE_END_CRIT_EDGE_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-PHIGUARD-NEXT: [[MUL_US:%.*]] = mul i32 [[I_021_US]], [[N]]
+; CHECK-PHIGUARD-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[N]])
+; CHECK-PHIGUARD-NEXT: br label [[WHILE_BODY3_US:%.*]]
+; CHECK-PHIGUARD: while.body3.us:
+; CHECK-PHIGUARD-NEXT: [[J_019_US:%.*]] = phi i32 [ 0, [[WHILE_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[WHILE_BODY3_US]] ]
+; CHECK-PHIGUARD-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[WHILE_COND1_PREHEADER_US]] ], [ [[TMP1:%.*]], [[WHILE_BODY3_US]] ]
+; CHECK-PHIGUARD-NEXT: [[ADD_US:%.*]] = add i32 [[J_019_US]], [[MUL_US]]
+; CHECK-PHIGUARD-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[ADD_US]]
+; CHECK-PHIGUARD-NEXT: store i32 [[ADD_US]], i32* [[ARRAYIDX_US]], align 4
+; CHECK-PHIGUARD-NEXT: [[INC_US]] = add nuw i32 [[J_019_US]], 1
+; CHECK-PHIGUARD-NEXT: [[TMP1]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
+; CHECK-PHIGUARD-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-PHIGUARD-NEXT: br i1 [[TMP2]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]]
+; CHECK-PHIGUARD: while.cond1.while.end_crit_edge.us:
+; CHECK-PHIGUARD-NEXT: [[INC6_US]] = add nuw i32 [[I_021_US]], 1
+; CHECK-PHIGUARD-NEXT: [[EXITCOND23:%.*]] = icmp eq i32 [[INC6_US]], [[N]]
+; CHECK-PHIGUARD-NEXT: br i1 [[EXITCOND23]], label [[WHILE_END7]], label [[WHILE_COND1_PREHEADER_US]]
+; CHECK-PHIGUARD: while.end7:
+; CHECK-PHIGUARD-NEXT: ret void
+;
+; CHECK-REGDEC-LABEL: @nested(
+; CHECK-REGDEC-NEXT: entry:
+; CHECK-REGDEC-NEXT: [[CMP20:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-REGDEC-NEXT: br i1 [[CMP20]], label [[WHILE_END7:%.*]], label [[WHILE_COND1_PREHEADER_US:%.*]]
+; CHECK-REGDEC: while.cond1.preheader.us:
+; CHECK-REGDEC-NEXT: [[I_021_US:%.*]] = phi i32 [ [[INC6_US:%.*]], [[WHILE_COND1_WHILE_END_CRIT_EDGE_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-REGDEC-NEXT: [[MUL_US:%.*]] = mul i32 [[I_021_US]], [[N]]
+; CHECK-REGDEC-NEXT: [[TMP0:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 [[N]])
+; CHECK-REGDEC-NEXT: br label [[WHILE_BODY3_US:%.*]]
+; CHECK-REGDEC: while.body3.us:
+; CHECK-REGDEC-NEXT: [[J_019_US:%.*]] = phi i32 [ 0, [[WHILE_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[WHILE_BODY3_US]] ]
+; CHECK-REGDEC-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[WHILE_BODY3_US]] ]
+; CHECK-REGDEC-NEXT: [[ADD_US:%.*]] = add i32 [[J_019_US]], [[MUL_US]]
+; CHECK-REGDEC-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[ADD_US]]
+; CHECK-REGDEC-NEXT: store i32 [[ADD_US]], i32* [[ARRAYIDX_US]], align 4
+; CHECK-REGDEC-NEXT: [[INC_US]] = add nuw i32 [[J_019_US]], 1
+; CHECK-REGDEC-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP1]], i32 1)
+; CHECK-REGDEC-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-REGDEC-NEXT: br i1 [[TMP3]], label [[WHILE_BODY3_US]], label [[WHILE_COND1_WHILE_END_CRIT_EDGE_US]]
+; CHECK-REGDEC: while.cond1.while.end_crit_edge.us:
+; CHECK-REGDEC-NEXT: [[INC6_US]] = add nuw i32 [[I_021_US]], 1
+; CHECK-REGDEC-NEXT: [[EXITCOND23:%.*]] = icmp eq i32 [[INC6_US]], [[N]]
+; CHECK-REGDEC-NEXT: br i1 [[EXITCOND23]], label [[WHILE_END7]], label [[WHILE_COND1_PREHEADER_US]]
+; CHECK-REGDEC: while.end7:
+; CHECK-REGDEC-NEXT: ret void
entry:
%cmp20 = icmp eq i32 %N, 0
br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us
diff --git a/llvm/test/Transforms/LoopUnroll/ARM/mve-nounroll.ll b/llvm/test/Transforms/LoopUnroll/ARM/mve-nounroll.ll
index 15016c1f7e7d..316ad645799e 100644
--- a/llvm/test/Transforms/LoopUnroll/ARM/mve-nounroll.ll
+++ b/llvm/test/Transforms/LoopUnroll/ARM/mve-nounroll.ll
@@ -66,6 +66,78 @@ for.body: ; preds = %for.body.preheader1
}
+; Same as above but without the nounroll on the remainder loop. Neither loop should be unrolled.
+
+; CHECK-LABEL: @remainder
+; CHECK: vector.body:
+; CHECK: br i1 %13, label %middle.block, label %vector.body, !llvm.loop !0
+; CHECK: middle.block:
+; CHECK: br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader13
+; CHECK: for.body:
+; CHECK: br i1 %exitcond.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body, !llvm.loop !0
+; CHECK: for.body.prol.1:
+; CHECK: br i1 %prol.iter.cmp.1, label %for.body.prol.2, label %for.body.prol.loopexit.unr-lcssa
+; CHECK: for.body.prol.2:
+; CHECK: br label %for.body.prol.loopexit.unr-lcssa
+
+define void @remainder(float* %s1, float* %s2, float* %d, i32 %n) { ; d[i] = s2[i] + s1[i] for i in [0, %n): a 4-wide vector loop followed by a scalar remainder loop
+entry:
+ %cmp10 = icmp sgt i32 %n, 0 ; nothing to do unless %n > 0 (signed)
+ br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %min.iters.check = icmp ult i32 %n, 4 ; fewer than 4 elements: bypass the vector loop
+ br i1 %min.iters.check, label %for.body.preheader13, label %vector.ph
+
+for.body.preheader13: ; preds = %middle.block, %for.body.preheader
+ %i.011.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] ; scalar loop start: 0 if the vector loop was bypassed, else %n.vec
+ br label %for.body
+
+vector.ph: ; preds = %for.body.preheader
+ %n.vec = and i32 %n, -4 ; %n with its low two bits cleared, i.e. rounded down to a multiple of 4
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; element index, advanced by 4 per iteration
+ %0 = getelementptr inbounds float, float* %s1, i32 %index
+ %1 = bitcast float* %0 to <4 x float>*
+ %wide.load = load <4 x float>, <4 x float>* %1, align 4 ; 4 floats from %s1
+ %2 = getelementptr inbounds float, float* %s2, i32 %index
+ %3 = bitcast float* %2 to <4 x float>*
+ %wide.load12 = load <4 x float>, <4 x float>* %3, align 4 ; 4 floats from %s2
+ %4 = fadd fast <4 x float> %wide.load12, %wide.load
+ %5 = getelementptr inbounds float, float* %d, i32 %index
+ %6 = bitcast float* %5 to <4 x float>*
+ store <4 x float> %4, <4 x float>* %6, align 4 ; write the 4 sums to %d
+ %index.next = add i32 %index, 4
+ %7 = icmp eq i32 %index.next, %n.vec ; leave once every full group of 4 is done
+ br i1 %7, label %middle.block, label %vector.body, !llvm.loop !0 ; NOTE(review): !0 is defined outside this hunk
+
+middle.block: ; preds = %vector.body
+ %cmp.n = icmp eq i32 %n.vec, %n ; true when %n is a multiple of 4, so no scalar remainder is needed
+ br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader13
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %middle.block, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader13, %for.body
+ %i.011 = phi i32 [ %add3, %for.body ], [ %i.011.ph, %for.body.preheader13 ] ; scalar element index
+ %arrayidx = getelementptr inbounds float, float* %s1, i32 %i.011
+ %8 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %s2, i32 %i.011
+ %9 = load float, float* %arrayidx1, align 4
+ %add = fadd fast float %9, %8
+ %arrayidx2 = getelementptr inbounds float, float* %d, i32 %i.011
+ store float %add, float* %arrayidx2, align 4 ; one element at a time: d[i] = s2[i] + s1[i]
+ %add3 = add nuw nsw i32 %i.011, 1
+ %exitcond = icmp eq i32 %add3, %n ; stop once the next index reaches %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0 ; same !llvm.loop node as vector.body
+}
+
+
; CHECK-LABEL: @nested
; CHECK: for.outer:
More information about the llvm-commits
mailing list