[llvm-branch-commits] [llvm] b0ce615 - [ARM] Remove copies from low overhead phi inductions.
David Green via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Dec 10 02:35:22 PST 2020
Author: David Green
Date: 2020-12-10T10:30:31Z
New Revision: b0ce615b2d29524b0b3541d07dd561665b710e79
URL: https://github.com/llvm/llvm-project/commit/b0ce615b2d29524b0b3541d07dd561665b710e79
DIFF: https://github.com/llvm/llvm-project/commit/b0ce615b2d29524b0b3541d07dd561665b710e79.diff
LOG: [ARM] Remove copies from low overhead phi inductions.
The phi created in a low overhead loop seems to be given a default
register class. Copies are then inserted between the low overhead
loop pseudo instructions (which produce/consume GPRlr registers) and
the phi holding the induction. This patch removes those copies as a
step towards making t2LoopDec and t2LoopEnd a single instruction, and
it appears useful in its own right, as the tests show.
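To illustrate, a rough before/after MIR sketch of the transformation
(virtual register names and block labels here are invented for the
example, not taken from the patch):

    ; Before: COPYs sit between the gprlr pseudos and the induction phi.
      %start:gprlr = t2DoLoopStart %n
      %startc:gpr = COPY %start
    bb.loop:
      %iv:gprlr = PHI %startc, %bb.preheader, %decc, %bb.loop
      %dec:gprlr = t2LoopDec %iv, 1
      %decc:gpr = COPY %dec
      t2LoopEnd %dec, %bb.loop, implicit-def dead $cpsr

    ; After: the phi is constrained to gprlr and uses the pseudos directly.
    bb.loop:
      %iv:gprlr = PHI %start, %bb.preheader, %dec, %bb.loop
      %dec:gprlr = t2LoopDec %iv, 1
      t2LoopEnd %dec, %bb.loop, implicit-def dead $cpsr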
Differential Revision: https://reviews.llvm.org/D91267
Added:
Modified:
llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
llvm/test/CodeGen/Thumb2/LowOverheadLoops/count_dominates_start.mir
llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
index e56c4ce36f7b..20cb98072c9a 100644
--- a/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
@@ -59,7 +59,7 @@ class MVEVPTOptimisations : public MachineFunctionPass {
}
private:
- bool RevertLoopWithCall(MachineLoop *ML);
+ bool MergeLoopEnd(MachineLoop *ML);
bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT);
MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
MachineInstr &Instr,
@@ -159,8 +159,15 @@ static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI,
return true;
}
-bool MVEVPTOptimisations::RevertLoopWithCall(MachineLoop *ML) {
- LLVM_DEBUG(dbgs() << "RevertLoopWithCall on loop " << ML->getHeader()->getName()
+// This function converts loops with t2LoopDec and t2LoopEnd instructions
+// into a single t2LoopEndDec instruction. To do that it needs to make sure
+// that LR will be valid to be used for the low overhead loop, which means
+// nothing else is using LR (especially calls) and there are no superfluous
+// copies in the loop. The t2LoopEndDec is a branching terminator that
+// produces a value (the decrement) around the loop edge, which means we
+// need to be careful that it will be valid to allocate without any spilling.
+bool MVEVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
+ LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName()
<< "\n");
MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
@@ -181,7 +188,58 @@ bool MVEVPTOptimisations::RevertLoopWithCall(MachineLoop *ML) {
}
}
- return false;
+ // Remove any copies from the loop, to ensure the phi that remains is both
+ // simpler and free of extra uses. Because t2LoopEndDec is a terminator
+ // that cannot spill, we need to be careful what remains in the loop.
+ Register PhiReg = LoopPhi->getOperand(0).getReg();
+ Register DecReg = LoopDec->getOperand(0).getReg();
+ Register StartReg = LoopStart->getOperand(0).getReg();
+ // Ensure the uses are expected, and collect any copies we want to remove.
+ SmallVector<MachineInstr *, 4> Copies;
+ auto CheckUsers = [&Copies](Register BaseReg,
+ ArrayRef<MachineInstr *> ExpectedUsers,
+ MachineRegisterInfo *MRI) {
+ SmallVector<Register, 4> Worklist;
+ Worklist.push_back(BaseReg);
+ while (!Worklist.empty()) {
+ Register Reg = Worklist.pop_back_val();
+ for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
+ if (count(ExpectedUsers, &MI))
+ continue;
+ if (MI.getOpcode() != TargetOpcode::COPY ||
+ !MI.getOperand(0).getReg().isVirtual()) {
+ LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI);
+ return false;
+ }
+ Worklist.push_back(MI.getOperand(0).getReg());
+ Copies.push_back(&MI);
+ }
+ }
+ return true;
+ };
+ if (!CheckUsers(PhiReg, {LoopDec}, MRI) ||
+ !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) ||
+ !CheckUsers(StartReg, {LoopPhi}, MRI))
+ return false;
+
+ MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass);
+ MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass);
+ MRI->constrainRegClass(DecReg, &ARM::GPRlrRegClass);
+
+ if (LoopPhi->getOperand(2).getMBB() == ML->getLoopLatch()) {
+ LoopPhi->getOperand(3).setReg(StartReg);
+ LoopPhi->getOperand(1).setReg(DecReg);
+ } else {
+ LoopPhi->getOperand(1).setReg(StartReg);
+ LoopPhi->getOperand(3).setReg(DecReg);
+ }
+
+ LoopDec->getOperand(1).setReg(PhiReg);
+ LoopEnd->getOperand(0).setReg(DecReg);
+
+ for (auto *MI : Copies)
+ MI->eraseFromParent();
+ return true;
}
// Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP
@@ -787,7 +845,7 @@ bool MVEVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
bool Modified = false;
for (MachineLoop *ML : MLI->getBase().getLoopsInPreorder()) {
- Modified |= RevertLoopWithCall(ML);
+ Modified |= MergeLoopEnd(ML);
Modified |= ConvertTailPredLoop(ML, DT);
}
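As context for the phi-rewiring logic in the hunk above: a
machine-level PHI lists the defined register first, followed by
(value, predecessor block) pairs, along these lines (a generic MIR
sketch, with made-up names):

    ; op0 = def; op1/op2 = first (value, block) pair; op3/op4 = second pair
    %iv:gprlr = PHI %a, %bb.preheader, %b, %bb.latch

So if operand 2 names the loop latch, operand 1 carries the
loop-carried decrement and operand 3 the start value from the
preheader; otherwise the roles are swapped.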
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/count_dominates_start.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/count_dominates_start.mir
index 627bf5d2e199..c07cdfcb2894 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/count_dominates_start.mir
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/count_dominates_start.mir
@@ -123,33 +123,31 @@ body: |
; CHECK: [[t2MOVi1:%[0-9]+]]:rgpr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[t2ADDrs:%[0-9]+]]:gprnopc = nuw nsw t2ADDrs [[t2MOVi1]], [[t2SUBri]], 27, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[COPY4:%[0-9]+]]:rgpr = COPY [[t2ADDrs]]
- ; CHECK: [[t2DoLoopStart:%[0-9]+]]:gprlr = t2DoLoopStart [[COPY4]]
; CHECK: [[t2MOVi2:%[0-9]+]]:rgpr = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[COPY5:%[0-9]+]]:gpr = COPY [[t2MOVi2]]
- ; CHECK: [[COPY6:%[0-9]+]]:gpr = COPY [[t2DoLoopStart]]
- ; CHECK: [[COPY7:%[0-9]+]]:gprnopc = COPY [[COPY]]
+ ; CHECK: [[COPY6:%[0-9]+]]:rgpr = COPY [[COPY]]
+ ; CHECK: [[t2DoLoopStartTP:%[0-9]+]]:gprlr = t2DoLoopStartTP [[COPY4]], [[COPY6]]
; CHECK: bb.3.vector.body:
; CHECK: successors: %bb.3(0x7c000000), %bb.4(0x04000000)
; CHECK: [[PHI:%[0-9]+]]:gprnopc = PHI [[COPY2]], %bb.2, %10, %bb.3
; CHECK: [[PHI1:%[0-9]+]]:gprnopc = PHI [[COPY1]], %bb.2, %9, %bb.3
; CHECK: [[PHI2:%[0-9]+]]:tgpreven = PHI [[COPY5]], %bb.2, %8, %bb.3
- ; CHECK: [[PHI3:%[0-9]+]]:gprlr = PHI [[COPY6]], %bb.2, %11, %bb.3
- ; CHECK: [[PHI4:%[0-9]+]]:rgpr = PHI [[COPY7]], %bb.2, %7, %bb.3
+ ; CHECK: [[PHI3:%[0-9]+]]:gprlr = PHI [[t2DoLoopStartTP]], %bb.2, %33, %bb.3
+ ; CHECK: [[PHI4:%[0-9]+]]:rgpr = PHI [[COPY6]], %bb.2, %7, %bb.3
; CHECK: [[MVE_VCTP16_:%[0-9]+]]:vccr = MVE_VCTP16 [[PHI4]], 0, $noreg
; CHECK: [[t2SUBri1:%[0-9]+]]:rgpr = t2SUBri [[PHI4]], 8, 14 /* CC::al */, $noreg, $noreg
- ; CHECK: [[COPY8:%[0-9]+]]:gpr = COPY [[t2SUBri1]]
+ ; CHECK: [[COPY7:%[0-9]+]]:gpr = COPY [[t2SUBri1]]
; CHECK: [[MVE_VLDRHU16_post:%[0-9]+]]:rgpr, [[MVE_VLDRHU16_post1:%[0-9]+]]:mqpr = MVE_VLDRHU16_post [[PHI]], 16, 1, [[MVE_VCTP16_]] :: (load 16 from %ir.lsr.iv35, align 2)
; CHECK: [[MVE_VLDRHU16_post2:%[0-9]+]]:rgpr, [[MVE_VLDRHU16_post3:%[0-9]+]]:mqpr = MVE_VLDRHU16_post [[PHI1]], 16, 1, [[MVE_VCTP16_]] :: (load 16 from %ir.lsr.iv12, align 2)
; CHECK: [[MVE_VMLADAVas16_:%[0-9]+]]:tgpreven = MVE_VMLADAVas16 [[PHI2]], killed [[MVE_VLDRHU16_post3]], killed [[MVE_VLDRHU16_post1]], 1, [[MVE_VCTP16_]]
- ; CHECK: [[COPY9:%[0-9]+]]:gpr = COPY [[MVE_VMLADAVas16_]]
- ; CHECK: [[COPY10:%[0-9]+]]:gpr = COPY [[MVE_VLDRHU16_post2]]
- ; CHECK: [[COPY11:%[0-9]+]]:gpr = COPY [[MVE_VLDRHU16_post]]
+ ; CHECK: [[COPY8:%[0-9]+]]:gpr = COPY [[MVE_VMLADAVas16_]]
+ ; CHECK: [[COPY9:%[0-9]+]]:gpr = COPY [[MVE_VLDRHU16_post2]]
+ ; CHECK: [[COPY10:%[0-9]+]]:gpr = COPY [[MVE_VLDRHU16_post]]
; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI3]], 1
- ; CHECK: [[COPY12:%[0-9]+]]:gpr = COPY [[t2LoopDec]]
; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def dead $cpsr
; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg
; CHECK: bb.4.for.cond.cleanup:
- ; CHECK: [[PHI5:%[0-9]+]]:gpr = PHI [[COPY3]], %bb.1, [[COPY9]], %bb.3
+ ; CHECK: [[PHI5:%[0-9]+]]:gpr = PHI [[COPY3]], %bb.1, [[COPY8]], %bb.3
; CHECK: $r0 = COPY [[PHI5]]
; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $r0
bb.0.entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
index 77faf646b5cb..6f04ac838a52 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll
@@ -323,15 +323,16 @@ for.cond.cleanup: ; preds = %vector.body, %entry
define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss2:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, lr}
-; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r3, #1
-; CHECK-NEXT: blt .LBB5_3
-; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: vmov r6, s0
-; CHECK-NEXT: vdup.32 q0, r6
-; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r4, pc}
+; CHECK-NEXT: .LBB5_1: @ %vector.ph
+; CHECK-NEXT: vmov lr, s0
+; CHECK-NEXT: vdup.32 q0, lr
; CHECK-NEXT: vneg.f32 q0, q0
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB5_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
@@ -342,8 +343,8 @@ define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocaptur
; CHECK-NEXT: vfma.f32 q3, q2, q1
; CHECK-NEXT: vstrw.32 q3, [r2], #16
; CHECK-NEXT: letp lr, .LBB5_2
-; CHECK-NEXT: .LBB5_3: @ %for.cond.cleanup
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT: pop {r4, pc}
entry:
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
index c4e2b06f113e..43f452801013 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
@@ -157,10 +157,10 @@ define dso_local void @mve_scatter_qi(i32* noalias nocapture readonly %A, i32* n
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: movw lr, #1250
; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: vmov.i32 q2, #0x3
; CHECK-NEXT: vadd.i32 q0, q0, r1
; CHECK-NEXT: adds r1, r3, #4
-; CHECK-NEXT: vmov.i32 q2, #0x3
+; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r3
@@ -231,65 +231,55 @@ define void @justoffsets(i8* noalias nocapture readonly %r, i8* noalias nocaptur
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: beq.w .LBB3_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: adds r3, r2, #3
-; CHECK-NEXT: adr r7, .LCPI3_5
-; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: vmov.i32 q0, #0x8000
-; CHECK-NEXT: sub.w r12, r3, #4
-; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: adr r6, .LCPI3_4
-; CHECK-NEXT: adr r5, .LCPI3_3
-; CHECK-NEXT: add.w lr, r3, r12, lsr #2
-; CHECK-NEXT: adr r4, .LCPI3_2
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: adr r7, .LCPI3_5
; CHECK-NEXT: vstrw.32 q0, [sp, #160] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r7]
-; CHECK-NEXT: adr.w r8, .LCPI3_1
-; CHECK-NEXT: adr.w r12, .LCPI3_0
-; CHECK-NEXT: adr r3, .LCPI3_6
+; CHECK-NEXT: adr r6, .LCPI3_4
+; CHECK-NEXT: adr r5, .LCPI3_3
; CHECK-NEXT: vstrw.32 q0, [sp, #176] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r6]
-; CHECK-NEXT: vldrw.u32 q1, [r3]
-; CHECK-NEXT: adr r3, .LCPI3_7
+; CHECK-NEXT: adr r4, .LCPI3_2
; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r5]
-; CHECK-NEXT: adr r6, .LCPI3_10
-; CHECK-NEXT: adr r7, .LCPI3_9
+; CHECK-NEXT: adr.w r8, .LCPI3_1
; CHECK-NEXT: vstrw.32 q0, [sp, #128] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r4]
-; CHECK-NEXT: vstrw.32 q1, [sp, #192] @ 16-byte Spill
; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r8]
+; CHECK-NEXT: adr.w r12, .LCPI3_0
; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r12]
+; CHECK-NEXT: adr r3, .LCPI3_6
+; CHECK-NEXT: adr r6, .LCPI3_10
+; CHECK-NEXT: vldrw.u32 q1, [r3]
; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill
; CHECK-NEXT: vmov.i32 q0, #0x7fff
+; CHECK-NEXT: adr r3, .LCPI3_7
; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r3]
+; CHECK-NEXT: adr r7, .LCPI3_9
; CHECK-NEXT: adr r3, .LCPI3_8
; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r6]
+; CHECK-NEXT: vstrw.32 q1, [sp, #192] @ 16-byte Spill
; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r7]
; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload
-; CHECK-NEXT: vctp.32 r2
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrbt.u32 q4, [r0, q0]
+; CHECK-NEXT: vldrb.u32 q4, [r0, q0]
; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrbt.u32 q7, [r0, q0]
+; CHECK-NEXT: vldrb.u32 q7, [r0, q0]
; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q5, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmul.i32 q6, q7, q0
; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrbt.u32 q1, [r0, q5]
+; CHECK-NEXT: vldrb.u32 q1, [r0, q5]
; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vmul.i32 q3, q4, q0
; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
@@ -320,14 +310,12 @@ define void @justoffsets(i8* noalias nocapture readonly %r, i8* noalias nocaptur
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload
; CHECK-NEXT: vshr.u32 q1, q1, #16
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrbt.32 q1, [r1, q0]
+; CHECK-NEXT: vstrb.32 q1, [r1, q0]
; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vstrbt.32 q2, [r1, q0]
-; CHECK-NEXT: vstrbt.32 q6, [r1, q5]
+; CHECK-NEXT: vstrb.32 q2, [r1, q0]
+; CHECK-NEXT: vstrb.32 q6, [r1, q5]
; CHECK-NEXT: adds r1, #12
-; CHECK-NEXT: le lr, .LBB3_2
+; CHECK-NEXT: letp lr, .LBB3_2
; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #216
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
index f3916102f9ea..8ac75b4f7277 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
@@ -9,32 +9,21 @@ define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture %
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: mov r12, r1
; CHECK-NEXT: vidup.u32 q2, r6, #1
-; CHECK-NEXT: cmp r1, #4
-; CHECK-NEXT: it ge
-; CHECK-NEXT: movge.w r12, #4
-; CHECK-NEXT: sub.w r6, r1, r12
-; CHECK-NEXT: adds r6, #3
-; CHECK-NEXT: mov.w lr, #1
; CHECK-NEXT: adr r4, .LCPI0_0
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: add.w lr, lr, r6, lsr #2
; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: vmov.i32 q3, #0x4
+; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: mov r12, r1
; CHECK-NEXT: .LBB0_1: @ %do.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vctp.32 r12
-; CHECK-NEXT: sub.w r12, r12, #4
-; CHECK-NEXT: vpstttt
-; CHECK-NEXT: vldrwt.u32 q4, [r0], #16
-; CHECK-NEXT: vcmpt.f32 ge, q1, q4
+; CHECK-NEXT: vldrw.u32 q4, [r0], #16
+; CHECK-NEXT: vptt.f32 ge, q1, q4
; CHECK-NEXT: vmovt q1, q4
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vadd.i32 q2, q2, q3
-; CHECK-NEXT: le lr, .LBB0_1
+; CHECK-NEXT: letp lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %do.end
; CHECK-NEXT: vldr s8, .LCPI0_1
; CHECK-NEXT: vdup.32 q3, r1