[llvm] bd516d2 - [ARM] Move t2DoLoopStart reg alloc hint
David Green via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 11 09:56:39 PST 2021
Author: David Green
Date: 2021-03-11T17:56:19Z
New Revision: bd516d24c1127c98322507a36066fad2083a15e9
URL: https://github.com/llvm/llvm-project/commit/bd516d24c1127c98322507a36066fad2083a15e9
DIFF: https://github.com/llvm/llvm-project/commit/bd516d24c1127c98322507a36066fad2083a15e9.diff
LOG: [ARM] Move t2DoLoopStart reg alloc hint
This adjusts the place that the t2DoLoopStart reg allocation hint is
inserted, adding it in the MVETPAndVPTOptimisations pass in a similar place
as other tail predicated loop optimizations. This removes the need for
doing so in a custom inserter, and should make the hint more accurate,
only adding it where we expect to create a DLS (not DLSTP or WLS).
Added:
Modified:
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/lib/Target/ARM/ARMInstrThumb2.td
llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 51cb35daa5df..802096e040df 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -11386,14 +11386,6 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return EmitLowered__chkstk(MI, BB);
case ARM::WIN__DBZCHK:
return EmitLowered__dbzchk(MI, BB);
- case ARM::t2DoLoopStart:
- // We are just here to set a register allocation hint, prefering lr for the
- // input register to make it more likely to be movable and removable, later
- // in the pipeline.
- Register R = MI.getOperand(1).getReg();
- MachineFunction *MF = MI.getParent()->getParent();
- MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0);
- return BB;
}
}
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index f562e8952f27..04d3bd8e5e12 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -5459,7 +5459,6 @@ let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB] in {
// t2DoLoopStart a pseudo for DLS hardware loops. Lowered into a DLS in
// ARMLowOverheadLoops if possible, or reverted to a Mov if not.
-let usesCustomInserter = 1 in
def t2DoLoopStart :
t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br,
[(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>;
diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
index c246ab3402bd..f21ea278ccd4 100644
--- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
@@ -75,6 +75,7 @@ class MVETPAndVPTOptimisations : public MachineFunctionPass {
bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
bool ConvertVPSEL(MachineBasicBlock &MBB);
+ bool HintDoLoopStartReg(MachineBasicBlock &MBB);
};
char MVETPAndVPTOptimisations::ID = 0;
@@ -946,6 +947,21 @@ bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
return !DeadInstructions.empty();
}
+// Add a register allocation hint for t2DoLoopStart to hint it towards LR, as
+// the instruction may be removable as a noop.
+bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ for (MachineInstr &MI : MBB.instrs()) {
+ if (MI.getOpcode() != ARM::t2DoLoopStart)
+ continue;
+ Register R = MI.getOperand(1).getReg();
+ MachineFunction *MF = MI.getParent()->getParent();
+ MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0);
+ Changed = true;
+ }
+ return Changed;
+}
+
bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
const ARMSubtarget &STI =
static_cast<const ARMSubtarget &>(Fn.getSubtarget());
@@ -969,6 +985,7 @@ bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
}
for (MachineBasicBlock &MBB : Fn) {
+ Modified |= HintDoLoopStartReg(MBB);
Modified |= ReplaceConstByVPNOTs(MBB, DT);
Modified |= ReplaceVCMPsByVPNOTs(MBB);
Modified |= ReduceOldVCCRValueUses(MBB);
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
index 5f9838e3e634..faf4a2e5998c 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -219,8 +219,9 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r3, #4
-; CHECK-NEXT: add.w lr, r12, r3, lsr #2
+; CHECK-NEXT: add.w r12, r12, r3, lsr #2
; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll
index b1efc91cdee9..1098038d53e3 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll
@@ -152,7 +152,7 @@ if.end: ; preds = %while.body, %while.
; CHECK: be_ne
; CHECK: body:
; CHECK: bb.0.entry:
-; CHECK: $lr = t2DLS killed renamable $r12
+; CHECK: $lr =
; CHECK: bb.2.do.body:
; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2
define void @be_ne(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
index f70a5da8a0cc..db86cacded25 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -15,8 +15,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w lr, r3, r12, lsr #2
+; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
@@ -90,8 +91,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w lr, r3, r12, lsr #2
+; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
@@ -165,8 +167,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w lr, r3, r12, lsr #2
+; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
@@ -240,8 +243,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w lr, r3, r12, lsr #2
+; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
@@ -315,8 +319,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w lr, r3, r12, lsr #2
+; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
index 3f8ec51ca84c..8e8c37bf7f52 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
@@ -73,7 +73,8 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(i8* nocaptu
; CHECK-NEXT: bic r3, r3, #7
; CHECK-NEXT: sub.w r12, r3, #8
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w lr, r3, r12, lsr #3
+; CHECK-NEXT: add.w r3, r3, r12, lsr #3
+; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.16 r2
@@ -145,7 +146,8 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_sub_add_v16i8(i8* nocaptur
; CHECK-NEXT: bic r3, r3, #15
; CHECK-NEXT: sub.w r12, r3, #16
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w lr, r3, r12, lsr #4
+; CHECK-NEXT: add.w r3, r3, r12, lsr #4
+; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.8 r2
@@ -214,7 +216,8 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_sub_add_v8i16(i8* nocaptu
; CHECK-NEXT: bic r3, r3, #7
; CHECK-NEXT: sub.w r12, r3, #8
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w lr, r3, r12, lsr #3
+; CHECK-NEXT: add.w r3, r3, r12, lsr #3
+; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.16 r2
@@ -285,7 +288,8 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_mul_add_v16i8(i8* nocaptur
; CHECK-NEXT: bic r3, r3, #15
; CHECK-NEXT: sub.w r12, r3, #16
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w lr, r3, r12, lsr #4
+; CHECK-NEXT: add.w r3, r3, r12, lsr #4
+; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.8 r2
@@ -354,7 +358,8 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_mul_add_v8i16(i8* nocaptu
; CHECK-NEXT: bic r3, r3, #7
; CHECK-NEXT: sub.w r12, r3, #8
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w lr, r3, r12, lsr #3
+; CHECK-NEXT: add.w r3, r3, r12, lsr #3
+; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB5_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.16 r2
@@ -413,7 +418,7 @@ for.cond.cleanup: ; preds = %middle.block, %entr
define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr {
; CHECK-LABEL: two_loops_mul_add_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: beq .LBB6_8
; CHECK-NEXT: @ %bb.1: @ %vector.ph
@@ -421,11 +426,12 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: mov r4, r0
-; CHECK-NEXT: subs r6, r3, #4
+; CHECK-NEXT: subs r7, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: mov r5, r1
-; CHECK-NEXT: add.w lr, r3, r6, lsr #2
+; CHECK-NEXT: add.w r6, r3, r7, lsr #2
; CHECK-NEXT: mov r3, r2
+; CHECK-NEXT: dls lr, r6
; CHECK-NEXT: .LBB6_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r3
@@ -445,8 +451,9 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vdup.32 q0, r3
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w lr, r3, r6, lsr #2
+; CHECK-NEXT: add.w r3, r3, r7, lsr #2
; CHECK-NEXT: vmov.32 q0[0], r12
+; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB6_5: @ %vector.body46
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
@@ -463,10 +470,10 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read
; CHECK-NEXT: vaddv.u32 r12, q0
; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup7
; CHECK-NEXT: mov r0, r12
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: .LBB6_8:
; CHECK-NEXT: movs r0, #0
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%cmp35 = icmp eq i32 %N, 0
br i1 %cmp35, label %for.cond.cleanup7, label %vector.ph
@@ -548,9 +555,10 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(i8* nocaptur
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vmov q3, q1
-; CHECK-NEXT: add.w lr, r4, r3, lsr #3
+; CHECK-NEXT: add.w r12, r4, r3, lsr #3
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: mov r4, r1
+; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB7_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.16 r2
@@ -668,7 +676,8 @@ define i32 @wrongop(%struct.date* nocapture readonly %pd) {
; CHECK-NEXT: adds r0, r2, #3
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: subs r0, #4
-; CHECK-NEXT: add.w lr, r12, r0, lsr #2
+; CHECK-NEXT: add.w r0, r12, r0, lsr #2
+; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: .LBB8_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
index 4b16066db794..d5a44e41e77f 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll
@@ -57,8 +57,9 @@ define i32 @bad(i32* readonly %x, i32* nocapture readonly %y, i32 %n) {
; CHECK-NEXT: subs r3, r2, r3
; CHECK-NEXT: add.w r12, r3, #3
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w lr, r3, r12, lsr #2
+; CHECK-NEXT: add.w r3, r3, r12, lsr #2
; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB1_1: @ %do.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
index 9deac6d8b3f7..1a370c483bad 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@@ -15,8 +15,9 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w lr, r3, r12, lsr #2
+; CHECK-NEXT: add.w r12, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
@@ -91,8 +92,9 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: bic r1, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
-; CHECK-NEXT: add.w lr, r3, r1, lsr #2
+; CHECK-NEXT: add.w r3, r3, r1, lsr #2
; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
@@ -161,8 +163,9 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: bic r1, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
-; CHECK-NEXT: add.w lr, r3, r1, lsr #2
+; CHECK-NEXT: add.w r3, r3, r1, lsr #2
; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
index 0be5649b3441..cba123c9e2f2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
@@ -4,21 +4,22 @@
define dso_local void @mve_gather_qi_wb(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) {
; CHECK-LABEL: mve_gather_qi_wb:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: add.w r12, r0, r3, lsl #2
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: add.w r4, r0, r3, lsl #2
; CHECK-NEXT: adr r0, .LCPI0_0
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: movw lr, #1250
+; CHECK-NEXT: movw r12, #1250
; CHECK-NEXT: vadd.i32 q0, q0, r1
; CHECK-NEXT: adds r1, r3, #4
+; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r3
; CHECK-NEXT: vmov q2, q1
; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q1, [r12], #16
+; CHECK-NEXT: vldrwt.u32 q1, [r4], #16
; CHECK-NEXT: vldrwt.u32 q3, [q0, #80]!
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vmul.i32 q1, q3, q1
@@ -28,7 +29,7 @@ define dso_local void @mve_gather_qi_wb(i32* noalias nocapture readonly %A, i32*
; CHECK-NEXT: vpsel q0, q1, q2
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: str.w r0, [r2, r1, lsl #2]
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI0_0:
@@ -148,22 +149,23 @@ end: ; preds = %middle.block
define dso_local void @mve_scatter_qi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) {
; CHECK-LABEL: mve_scatter_qi:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: add.w r12, r0, r3, lsl #2
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: add.w r4, r0, r3, lsl #2
; CHECK-NEXT: adr r0, .LCPI2_0
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: movw lr, #1250
+; CHECK-NEXT: movw r12, #1250
; CHECK-NEXT: vmov.i32 q2, #0x3
; CHECK-NEXT: vadd.i32 q0, q0, r1
; CHECK-NEXT: adds r1, r3, #4
+; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r3
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q1, [r12], #16
+; CHECK-NEXT: vldrwt.u32 q1, [r4], #16
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vmul.i32 q1, q1, q2
; CHECK-NEXT: vpst
@@ -174,7 +176,7 @@ define dso_local void @mve_scatter_qi(i32* noalias nocapture readonly %A, i32* n
; CHECK-NEXT: vpsel q0, q1, q3
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: str.w r0, [r2, r1, lsl #2]
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI2_0:
@@ -236,34 +238,34 @@ define void @justoffsets(i8* noalias nocapture readonly %r, i8* noalias nocaptur
; CHECK-NEXT: adr r5, .LCPI3_3
; CHECK-NEXT: vstrw.32 q0, [sp, #176] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r6]
-; CHECK-NEXT: adr r4, .LCPI3_2
+; CHECK-NEXT: adr.w r8, .LCPI3_2
; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r5]
-; CHECK-NEXT: adr.w r8, .LCPI3_1
+; CHECK-NEXT: adr.w lr, .LCPI3_1
; CHECK-NEXT: vstrw.32 q0, [sp, #128] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r4]
-; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r8]
+; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [lr]
; CHECK-NEXT: adr.w r12, .LCPI3_0
; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r12]
-; CHECK-NEXT: adr r3, .LCPI3_6
-; CHECK-NEXT: adr r6, .LCPI3_10
-; CHECK-NEXT: vldrw.u32 q1, [r3]
+; CHECK-NEXT: adr r7, .LCPI3_7
+; CHECK-NEXT: adr r5, .LCPI3_10
; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill
; CHECK-NEXT: vmov.i32 q0, #0x7fff
-; CHECK-NEXT: adr r3, .LCPI3_7
; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r3]
-; CHECK-NEXT: adr r7, .LCPI3_9
-; CHECK-NEXT: adr r3, .LCPI3_8
+; CHECK-NEXT: vldrw.u32 q0, [r7]
+; CHECK-NEXT: adr r6, .LCPI3_9
+; CHECK-NEXT: adr r4, .LCPI3_6
; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [r5]
+; CHECK-NEXT: adr r7, .LCPI3_8
+; CHECK-NEXT: vldrw.u32 q1, [r4]
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: vstrw.32 q1, [sp, #192] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r7]
; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r3]
+; CHECK-NEXT: vldrw.u32 q0, [r7]
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB3_2: @ %vector.body
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
index e7a75935912d..bb3709ab517f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
@@ -728,16 +728,16 @@ define i8* @signext(i8* %input_row, i8* %input_col, i16 zeroext %output_ch, i16
; CHECK-NEXT: bge .LBB5_3
; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader
; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1
-; CHECK-NEXT: ldr r2, [sp, #92]
+; CHECK-NEXT: ldr.w lr, [sp, #92]
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: mov r6, r10
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: mov r12, r10
-; CHECK-NEXT: mla r3, r9, r2, r0
+; CHECK-NEXT: mla r3, r9, lr, r0
; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload
; CHECK-NEXT: mov r8, r10
-; CHECK-NEXT: dlstp.16 lr, r2
+; CHECK-NEXT: dlstp.16 lr, lr
; CHECK-NEXT: .LBB5_7: @ %for.body24
; CHECK-NEXT: @ Parent Loop BB5_5 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
@@ -901,16 +901,16 @@ define i8* @signext_optsize(i8* %input_row, i8* %input_col, i16 zeroext %output_
; CHECK-NEXT: bge .LBB6_6
; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader
; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1
-; CHECK-NEXT: ldr r2, [sp, #92]
+; CHECK-NEXT: ldr.w lr, [sp, #92]
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: mov r6, r10
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: mov r12, r10
-; CHECK-NEXT: mla r3, r9, r2, r0
+; CHECK-NEXT: mla r3, r9, lr, r0
; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload
; CHECK-NEXT: mov r8, r10
-; CHECK-NEXT: dlstp.16 lr, r2
+; CHECK-NEXT: dlstp.16 lr, lr
; CHECK-NEXT: .LBB6_5: @ %for.body24
; CHECK-NEXT: @ Parent Loop BB6_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
index 09362677650f..56f4acd78b4b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll
@@ -11,8 +11,8 @@ define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture %
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: vidup.u32 q2, r6, #1
; CHECK-NEXT: adr r4, .LCPI0_0
-; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vldrw.u32 q1, [r4]
+; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vmov.i32 q3, #0x4
; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: .LBB0_1: @ %do.body
More information about the llvm-commits
mailing list