[llvm] 9e03547 - [ARM][HWLoops] Create hardware loops for sibling loops

David Green via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 3 09:20:19 PDT 2020


Author: David Green
Date: 2020-07-03T17:20:02+01:00
New Revision: 9e03547cab691521ea3be9dab0b543156ce44c04

URL: https://github.com/llvm/llvm-project/commit/9e03547cab691521ea3be9dab0b543156ce44c04
DIFF: https://github.com/llvm/llvm-project/commit/9e03547cab691521ea3be9dab0b543156ce44c04.diff

LOG: [ARM][HWLoops] Create hardware loops for sibling loops

Given a loop with two subloops, it should be possible for both to be
converted to hardware loops, and that is what this patch does. It slightly
alters the loop iteration order so that conversion is attempted on all
subloops instead of stopping at the first one that succeeds. If one (or
more) succeeds, the search stops at the parent loop as before.

Differential Revision: https://reviews.llvm.org/D78502

Added: 
    

Modified: 
    llvm/lib/CodeGen/HardwareLoops.cpp
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
    llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
    llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
    llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp
index 3b97e7b97aa3..0ba7e920e507 100644
--- a/llvm/lib/CodeGen/HardwareLoops.cpp
+++ b/llvm/lib/CodeGen/HardwareLoops.cpp
@@ -245,14 +245,17 @@ bool HardwareLoops::runOnFunction(Function &F) {
 // converted and the parent loop doesn't support containing a hardware loop.
 bool HardwareLoops::TryConvertLoop(Loop *L) {
   // Process nested loops first.
-  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
-    if (TryConvertLoop(*I)) {
-      reportHWLoopFailure("nested hardware-loops not supported", "HWLoopNested",
-                          ORE, L);
-      return true; // Stop search.
-    }
+  bool AnyChanged = false;
+  for (Loop *SL : *L)
+    AnyChanged |= TryConvertLoop(SL);
+  if (AnyChanged) {
+    reportHWLoopFailure("nested hardware-loops not supported", "HWLoopNested",
+                        ORE, L);
+    return true; // Stop search.
   }
 
+  LLVM_DEBUG(dbgs() << "HWLoops: Loop " << L->getHeader()->getName() << "\n");
+
   HardwareLoopInfo HWLoopInfo(L);
   if (!HWLoopInfo.canAnalyze(*LI)) {
     reportHWLoopFailure("cannot analyze loop, irreducible control flow",

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
index 245694942ffb..69d370fc01a2 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
@@ -29,16 +29,16 @@ define arm_aapcs_vfpcc void @test(i16* noalias nocapture readonly %off, i16* noa
 ; CHECK-NEXT:  @ %bb.4: @ %for.body15.us.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    movs r5, #0
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB0_5: @ %for.body15.us
 ; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrh.w r6, [r0, r5, lsl #1]
-; CHECK-NEXT:    ldrh.w r7, [r1, r5, lsl #1]
-; CHECK-NEXT:    add r7, r6
-; CHECK-NEXT:    strh.w r7, [r2, r5, lsl #1]
+; CHECK-NEXT:    ldrh.w r7, [r0, r5, lsl #1]
+; CHECK-NEXT:    ldrh.w r6, [r1, r5, lsl #1]
+; CHECK-NEXT:    add r6, r7
+; CHECK-NEXT:    strh.w r6, [r2, r5, lsl #1]
 ; CHECK-NEXT:    adds r5, #1
-; CHECK-NEXT:    cmp r3, r5
-; CHECK-NEXT:    bne .LBB0_5
+; CHECK-NEXT:    le lr, .LBB0_5
 ; CHECK-NEXT:  @ %bb.6: @ %for.cond.cleanup14.us
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    add.w r8, r8, #1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index 62d2f178561b..42e8cc91ede8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -1059,10 +1059,9 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    .pad #28
-; CHECK-NEXT:    sub sp, #28
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
 ; CHECK-NEXT:    cmp r3, #8
-; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
 ; CHECK-NEXT:    blo.w .LBB16_12
 ; CHECK-NEXT:  @ %bb.1: @ %if.then
 ; CHECK-NEXT:    movs r7, #0
@@ -1070,73 +1069,73 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    beq.w .LBB16_12
 ; CHECK-NEXT:  @ %bb.2: @ %while.body.lr.ph
 ; CHECK-NEXT:    ldrh r4, [r0]
-; CHECK-NEXT:    movs r1, #1
+; CHECK-NEXT:    lsr.w r10, r3, #2
 ; CHECK-NEXT:    ldrd r5, r12, [r0, #4]
-; CHECK-NEXT:    lsr.w r11, r3, #2
+; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    sub.w r0, r4, #8
-; CHECK-NEXT:    rsbs r3, r4, #0
+; CHECK-NEXT:    and r9, r0, #7
 ; CHECK-NEXT:    add.w r7, r0, r0, lsr #29
-; CHECK-NEXT:    and r0, r0, #7
 ; CHECK-NEXT:    asrs r6, r7, #3
 ; CHECK-NEXT:    cmp r6, #1
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    asrgt r1, r7, #3
+; CHECK-NEXT:    asrgt r3, r7, #3
 ; CHECK-NEXT:    add.w r7, r5, r4, lsl #1
-; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
-; CHECK-NEXT:    subs r1, r7, #2
-; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    add.w r3, r12, #16
-; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    adds r0, #1
-; CHECK-NEXT:    str r4, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
+; CHECK-NEXT:    subs r3, r7, #2
+; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    rsbs r3, r4, #0
 ; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    add.w r3, r12, #16
+; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB16_4
 ; CHECK-NEXT:  .LBB16_3: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    subs.w r11, r11, #1
+; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    subs.w r10, r10, #1
+; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #8
-; CHECK-NEXT:    add.w r0, r7, r0, lsl #1
+; CHECK-NEXT:    add.w r0, r5, r0, lsl #1
 ; CHECK-NEXT:    add.w r5, r0, #8
 ; CHECK-NEXT:    beq.w .LBB16_12
 ; CHECK-NEXT:  .LBB16_4: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB16_6 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
-; CHECK-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [r1], #8
 ; CHECK-NEXT:    ldrh.w lr, [r12, #14]
-; CHECK-NEXT:    vldrw.u32 q0, [r0], #8
-; CHECK-NEXT:    ldrh.w r10, [r12, #12]
-; CHECK-NEXT:    ldrh.w r7, [r12, #10]
-; CHECK-NEXT:    ldrh.w r4, [r12, #8]
+; CHECK-NEXT:    ldrh.w r0, [r12, #12]
+; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    ldrh.w r4, [r12, #10]
+; CHECK-NEXT:    ldrh.w r7, [r12, #8]
 ; CHECK-NEXT:    ldrh.w r3, [r12, #6]
 ; CHECK-NEXT:    ldrh.w r6, [r12, #4]
-; CHECK-NEXT:    ldrh.w r8, [r12, #2]
-; CHECK-NEXT:    ldrh.w r9, [r12]
+; CHECK-NEXT:    ldrh.w r11, [r12, #2]
+; CHECK-NEXT:    ldrh.w r8, [r12]
 ; CHECK-NEXT:    vstrb.8 q0, [r1], #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r5]
-; CHECK-NEXT:    str r0, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    adds r0, r5, #2
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmul.f16 q0, q0, r9
-; CHECK-NEXT:    adds r0, r5, #6
-; CHECK-NEXT:    vfma.f16 q0, q1, r8
+; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    adds r1, r5, #2
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vmul.f16 q0, q0, r8
+; CHECK-NEXT:    adds r1, r5, #6
+; CHECK-NEXT:    vfma.f16 q0, q1, r11
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #4]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r6
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    add.w r0, r5, #10
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    add.w r1, r5, #10
 ; CHECK-NEXT:    vfma.f16 q0, q1, r3
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #8]
-; CHECK-NEXT:    vfma.f16 q0, q1, r4
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    add.w r0, r5, #14
 ; CHECK-NEXT:    vfma.f16 q0, q1, r7
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vfma.f16 q0, q1, r4
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #12]
-; CHECK-NEXT:    add.w r7, r5, #16
-; CHECK-NEXT:    vfma.f16 q0, q1, r10
+; CHECK-NEXT:    vfma.f16 q0, q1, r0
+; CHECK-NEXT:    add.w r0, r5, #14
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    adds r5, #16
 ; CHECK-NEXT:    vfma.f16 q0, q1, lr
 ; CHECK-NEXT:    cmp r0, #16
 ; CHECK-NEXT:    blo .LBB16_7
@@ -1144,69 +1143,68 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
 ; CHECK-NEXT:    ldr.w lr, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, lr
-; CHECK-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:  .LBB16_6: @ %for.body
 ; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldrh r0, [r6]
-; CHECK-NEXT:    vldrw.u32 q1, [r7]
-; CHECK-NEXT:    adds r3, r7, #2
+; CHECK-NEXT:    vldrw.u32 q1, [r5]
+; CHECK-NEXT:    adds r1, r5, #2
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
-; CHECK-NEXT:    vldrw.u32 q1, [r3]
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    ldrh r0, [r6, #2]
-; CHECK-NEXT:    adds r3, r7, #6
+; CHECK-NEXT:    adds r1, r5, #6
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    ldrh r0, [r6, #4]
-; CHECK-NEXT:    vldrw.u32 q1, [r7, #4]
+; CHECK-NEXT:    vldrw.u32 q1, [r5, #4]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
-; CHECK-NEXT:    vldrw.u32 q1, [r3]
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    ldrh r0, [r6, #6]
-; CHECK-NEXT:    add.w r3, r7, #10
+; CHECK-NEXT:    add.w r1, r5, #10
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    ldrh r0, [r6, #8]
-; CHECK-NEXT:    vldrw.u32 q1, [r7, #8]
+; CHECK-NEXT:    vldrw.u32 q1, [r5, #8]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
-; CHECK-NEXT:    vldrw.u32 q1, [r3]
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    ldrh r0, [r6, #10]
-; CHECK-NEXT:    ldrh r3, [r6, #14]
+; CHECK-NEXT:    ldrh r1, [r6, #14]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    ldrh r0, [r6, #12]
-; CHECK-NEXT:    vldrw.u32 q1, [r7, #12]
+; CHECK-NEXT:    vldrw.u32 q1, [r5, #12]
 ; CHECK-NEXT:    adds r6, #16
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
-; CHECK-NEXT:    add.w r0, r7, #14
+; CHECK-NEXT:    add.w r0, r5, #14
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    adds r7, #16
-; CHECK-NEXT:    vfma.f16 q0, q1, r3
+; CHECK-NEXT:    adds r5, #16
+; CHECK-NEXT:    vfma.f16 q0, q1, r1
 ; CHECK-NEXT:    le lr, .LBB16_6
 ; CHECK-NEXT:    b .LBB16_8
 ; CHECK-NEXT:  .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:  .LBB16_8: @ %for.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    cmp.w r9, #0
 ; CHECK-NEXT:    beq.w .LBB16_3
-; CHECK-NEXT:  @ %bb.9: @ %while.body76.preheader
+; CHECK-NEXT:    b .LBB16_9
+; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    mov r5, r7
+; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov lr, r9
 ; CHECK-NEXT:  .LBB16_10: @ %while.body76
 ; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrh r3, [r6], #2
-; CHECK-NEXT:    vldrh.u16 q1, [r5], #2
-; CHECK-NEXT:    subs r0, #1
-; CHECK-NEXT:    vfma.f16 q0, q1, r3
-; CHECK-NEXT:    cmp r0, #1
-; CHECK-NEXT:    bgt .LBB16_10
-; CHECK-NEXT:  @ %bb.11: @ %while.end.loopexit
+; CHECK-NEXT:    ldrh r1, [r6], #2
+; CHECK-NEXT:    vldrh.u16 q1, [r0], #2
+; CHECK-NEXT:    subs.w lr, lr, #1
+; CHECK-NEXT:    vfma.f16 q0, q1, r1
+; CHECK-NEXT:    bne .LBB16_10
+; CHECK-NEXT:    b .LBB16_11
+; CHECK-NEXT:  .LBB16_11: @ %while.end.loopexit
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    add.w r7, r7, r0, lsl #1
+; CHECK-NEXT:    add.w r5, r5, r9, lsl #1
 ; CHECK-NEXT:    b .LBB16_3
 ; CHECK-NEXT:  .LBB16_12: @ %if.end
-; CHECK-NEXT:    add sp, #28
+; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 6ebddac98712..bf6b57f223a4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1040,121 +1040,121 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    cmp.w r7, r3, lsr #2
 ; CHECK-NEXT:    beq.w .LBB16_12
 ; CHECK-NEXT:  @ %bb.2: @ %while.body.lr.ph
-; CHECK-NEXT:    ldrh r5, [r0]
-; CHECK-NEXT:    lsr.w r9, r3, #2
-; CHECK-NEXT:    ldrd r8, r12, [r0, #4]
+; CHECK-NEXT:    ldrh r4, [r0]
+; CHECK-NEXT:    lsr.w r10, r3, #2
+; CHECK-NEXT:    ldrd r5, r12, [r0, #4]
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    sub.w r0, r5, #8
-; CHECK-NEXT:    add.w r7, r0, r0, lsr #29
-; CHECK-NEXT:    and r0, r0, #7
-; CHECK-NEXT:    asrs r6, r7, #3
+; CHECK-NEXT:    sub.w r7, r4, #8
+; CHECK-NEXT:    add.w r0, r7, r7, lsr #29
+; CHECK-NEXT:    asrs r6, r0, #3
 ; CHECK-NEXT:    cmp r6, #1
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    asrgt r3, r7, #3
-; CHECK-NEXT:    add.w r7, r8, r5, lsl #2
-; CHECK-NEXT:    sub.w r11, r7, #4
-; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
-; CHECK-NEXT:    rsbs r3, r5, #0
+; CHECK-NEXT:    asrgt r3, r0, #3
+; CHECK-NEXT:    add.w r0, r5, r4, lsl #2
+; CHECK-NEXT:    sub.w r9, r0, #4
+; CHECK-NEXT:    rsbs r0, r4, #0
+; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    and r3, r7, #7
+; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    add.w r0, r12, #32
+; CHECK-NEXT:    str r4, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    add.w r3, r12, #32
-; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    adds r0, #1
-; CHECK-NEXT:    str r5, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    str r0, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB16_4
 ; CHECK-NEXT:  .LBB16_3: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    subs.w r9, r9, #1
-; CHECK-NEXT:    ldrd r11, r1, [sp, #24] @ 8-byte Folded Reload
+; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    subs.w r10, r10, #1
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
-; CHECK-NEXT:    add.w r0, r8, r0, lsl #2
-; CHECK-NEXT:    add.w r8, r0, #16
-; CHECK-NEXT:    beq .LBB16_12
+; CHECK-NEXT:    add.w r0, r5, r0, lsl #2
+; CHECK-NEXT:    add.w r5, r0, #16
+; CHECK-NEXT:    beq.w .LBB16_12
 ; CHECK-NEXT:  .LBB16_4: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB16_6 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
-; CHECK-NEXT:    ldrd r0, r7, [r12]
-; CHECK-NEXT:    ldrd r4, r6, [r12, #8]
-; CHECK-NEXT:    ldrd r5, r3, [r12, #16]
-; CHECK-NEXT:    ldrd lr, r10, [r12, #24]
-; CHECK-NEXT:    vstrb.8 q0, [r11], #16
-; CHECK-NEXT:    vldrw.u32 q0, [r8], #32
-; CHECK-NEXT:    vldrw.u32 q1, [r8, #-28]
-; CHECK-NEXT:    vmul.f32 q0, q0, r0
-; CHECK-NEXT:    vldrw.u32 q6, [r8, #-24]
-; CHECK-NEXT:    vldrw.u32 q4, [r8, #-20]
-; CHECK-NEXT:    vfma.f32 q0, q1, r7
-; CHECK-NEXT:    vldrw.u32 q5, [r8, #-16]
-; CHECK-NEXT:    vfma.f32 q0, q6, r4
-; CHECK-NEXT:    vldrw.u32 q2, [r8, #-12]
+; CHECK-NEXT:    ldrd r7, r4, [r12]
+; CHECK-NEXT:    ldrd r0, r6, [r12, #8]
+; CHECK-NEXT:    ldrd r3, lr, [r12, #16]
+; CHECK-NEXT:    ldrd r11, r8, [r12, #24]
+; CHECK-NEXT:    vstrb.8 q0, [r9], #16
+; CHECK-NEXT:    vldrw.u32 q0, [r5], #32
+; CHECK-NEXT:    vldrw.u32 q1, [r5, #-28]
+; CHECK-NEXT:    vmul.f32 q0, q0, r7
+; CHECK-NEXT:    vldrw.u32 q6, [r5, #-24]
+; CHECK-NEXT:    vldrw.u32 q4, [r5, #-20]
+; CHECK-NEXT:    vfma.f32 q0, q1, r4
+; CHECK-NEXT:    vldrw.u32 q5, [r5, #-16]
+; CHECK-NEXT:    vfma.f32 q0, q6, r0
+; CHECK-NEXT:    vldrw.u32 q2, [r5, #-12]
 ; CHECK-NEXT:    vfma.f32 q0, q4, r6
-; CHECK-NEXT:    vldrw.u32 q3, [r8, #-8]
-; CHECK-NEXT:    vfma.f32 q0, q5, r5
-; CHECK-NEXT:    vldrw.u32 q1, [r8, #-4]
-; CHECK-NEXT:    vfma.f32 q0, q2, r3
-; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    vfma.f32 q0, q3, lr
-; CHECK-NEXT:    strd r11, r1, [sp, #24] @ 8-byte Folded Spill
-; CHECK-NEXT:    vfma.f32 q0, q1, r10
+; CHECK-NEXT:    vldrw.u32 q3, [r5, #-8]
+; CHECK-NEXT:    vfma.f32 q0, q5, r3
+; CHECK-NEXT:    vldrw.u32 q1, [r5, #-4]
+; CHECK-NEXT:    vfma.f32 q0, q2, lr
+; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    vfma.f32 q0, q3, r11
+; CHECK-NEXT:    strd r9, r1, [sp, #24] @ 8-byte Folded Spill
+; CHECK-NEXT:    vfma.f32 q0, q1, r8
 ; CHECK-NEXT:    cmp r0, #16
 ; CHECK-NEXT:    blo .LBB16_7
 ; CHECK-NEXT:  @ %bb.5: @ %for.body.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr.w lr, [sp] @ 4-byte Reload
+; CHECK-NEXT:    ldr.w lr, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:  .LBB16_6: @ %for.body
 ; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldm.w r7, {r0, r3, r4, r5, r6, r10, r11}
-; CHECK-NEXT:    vldrw.u32 q1, [r8], #32
-; CHECK-NEXT:    vldrw.u32 q6, [r8, #-24]
-; CHECK-NEXT:    vldrw.u32 q4, [r8, #-20]
+; CHECK-NEXT:    ldm.w r7, {r0, r3, r4, r6}
+; CHECK-NEXT:    vldrw.u32 q1, [r5], #32
+; CHECK-NEXT:    add.w r11, r7, #16
+; CHECK-NEXT:    vldrw.u32 q6, [r5, #-24]
+; CHECK-NEXT:    vldrw.u32 q4, [r5, #-20]
 ; CHECK-NEXT:    vfma.f32 q0, q1, r0
-; CHECK-NEXT:    vldrw.u32 q1, [r8, #-28]
-; CHECK-NEXT:    vldrw.u32 q5, [r8, #-16]
-; CHECK-NEXT:    vldrw.u32 q2, [r8, #-12]
+; CHECK-NEXT:    vldrw.u32 q1, [r5, #-28]
+; CHECK-NEXT:    ldm.w r11, {r1, r8, r11}
+; CHECK-NEXT:    vldrw.u32 q5, [r5, #-16]
 ; CHECK-NEXT:    vfma.f32 q0, q1, r3
-; CHECK-NEXT:    vldrw.u32 q3, [r8, #-8]
+; CHECK-NEXT:    vldrw.u32 q2, [r5, #-12]
 ; CHECK-NEXT:    vfma.f32 q0, q6, r4
-; CHECK-NEXT:    ldr r1, [r7, #28]
-; CHECK-NEXT:    vfma.f32 q0, q4, r5
-; CHECK-NEXT:    vldrw.u32 q1, [r8, #-4]
-; CHECK-NEXT:    vfma.f32 q0, q5, r6
+; CHECK-NEXT:    vldrw.u32 q3, [r5, #-8]
+; CHECK-NEXT:    vfma.f32 q0, q4, r6
+; CHECK-NEXT:    ldr.w r9, [r7, #28]
+; CHECK-NEXT:    vfma.f32 q0, q5, r1
+; CHECK-NEXT:    vldrw.u32 q1, [r5, #-4]
+; CHECK-NEXT:    vfma.f32 q0, q2, r8
 ; CHECK-NEXT:    adds r7, #32
-; CHECK-NEXT:    vfma.f32 q0, q2, r10
 ; CHECK-NEXT:    vfma.f32 q0, q3, r11
-; CHECK-NEXT:    vfma.f32 q0, q1, r1
+; CHECK-NEXT:    vfma.f32 q0, q1, r9
 ; CHECK-NEXT:    le lr, .LBB16_6
 ; CHECK-NEXT:    b .LBB16_8
 ; CHECK-NEXT:  .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
 ; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:  .LBB16_8: @ %for.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    ldrd r9, r1, [sp, #24] @ 8-byte Folded Reload
+; CHECK-NEXT:    ldr r3, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    cmp.w r3, #0
 ; CHECK-NEXT:    beq .LBB16_3
-; CHECK-NEXT:  @ %bb.9: @ %while.body76.preheader
+; CHECK-NEXT:    b .LBB16_9
+; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    mov r4, r8
+; CHECK-NEXT:    mov r4, r5
+; CHECK-NEXT:    mov lr, r3
 ; CHECK-NEXT:  .LBB16_10: @ %while.body76
 ; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldr r1, [r7], #4
+; CHECK-NEXT:    ldr r0, [r7], #4
 ; CHECK-NEXT:    vldrw.u32 q1, [r4], #4
-; CHECK-NEXT:    subs r0, #1
-; CHECK-NEXT:    vfma.f32 q0, q1, r1
-; CHECK-NEXT:    cmp r0, #1
-; CHECK-NEXT:    bgt .LBB16_10
-; CHECK-NEXT:  @ %bb.11: @ %while.end.loopexit
+; CHECK-NEXT:    subs.w lr, lr, #1
+; CHECK-NEXT:    vfma.f32 q0, q1, r0
+; CHECK-NEXT:    bne .LBB16_10
+; CHECK-NEXT:    b .LBB16_11
+; CHECK-NEXT:  .LBB16_11: @ %while.end.loopexit
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    add.w r8, r8, r0, lsl #2
+; CHECK-NEXT:    add.w r5, r5, r3, lsl #2
 ; CHECK-NEXT:    b .LBB16_3
 ; CHECK-NEXT:  .LBB16_12: @ %if.end
 ; CHECK-NEXT:    add sp, #32

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index 365f5e8b3a6a..fceda29f252a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -594,57 +594,56 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    strd r0, r2, [sp, #24] @ 8-byte Folded Spill
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
+; CHECK-NEXT:    strd r0, r2, [sp, #16] @ 8-byte Folded Spill
 ; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    mov r0, r3
 ; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    ldrne.w lr, [sp, #120]
-; CHECK-NEXT:    cmpne.w lr, #0
+; CHECK-NEXT:    ldrne r0, [sp, #112]
+; CHECK-NEXT:    cmpne r0, #0
 ; CHECK-NEXT:    bne .LBB10_2
 ; CHECK-NEXT:  .LBB10_1: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:  .LBB10_2: @ %for.cond1.preheader.us.preheader
-; CHECK-NEXT:    ldr.w r11, [sp, #124]
+; CHECK-NEXT:    ldr.w r11, [sp, #116]
 ; CHECK-NEXT:    mov r6, r1
 ; CHECK-NEXT:    movs r1, #1
-; CHECK-NEXT:    vdup.32 q4, lr
-; CHECK-NEXT:    bic r0, r11, #3
-; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT:    subs r0, #4
-; CHECK-NEXT:    lsl.w r4, lr, #1
 ; CHECK-NEXT:    mov.w r9, #0
-; CHECK-NEXT:    vshl.i32 q6, q4, #2
+; CHECK-NEXT:    bic r10, r11, #3
+; CHECK-NEXT:    sub.w r0, r10, #4
 ; CHECK-NEXT:    add.w r8, r1, r0, lsr #2
+; CHECK-NEXT:    ldr r1, [sp, #112]
 ; CHECK-NEXT:    lsl.w r0, r11, #1
-; CHECK-NEXT:    str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    adr r0, .LCPI10_0
+; CHECK-NEXT:    vdup.32 q4, r1
 ; CHECK-NEXT:    vldrw.u32 q5, [r0]
-; CHECK-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT:    lsls r4, r1, #1
+; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    vshl.i32 q6, q4, #2
 ; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    str r0, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB10_5
 ; CHECK-NEXT:  .LBB10_3: @ %for.cond5.preheader.us73.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    mov r1, r4
 ; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
 ; CHECK-NEXT:    bl __aeabi_memclr
-; CHECK-NEXT:    ldr.w lr, [sp, #120]
 ; CHECK-NEXT:  .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
 ; CHECK-NEXT:    @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    add r9, r11
-; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    add r1, r0
-; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    add r1, r0
+; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    adds r1, #1
 ; CHECK-NEXT:    cmp r1, r0
 ; CHECK-NEXT:    beq .LBB10_1
@@ -653,21 +652,23 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
 ; CHECK-NEXT:    @ Child Loop BB10_8 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB10_11 Depth 3
 ; CHECK-NEXT:    @ Child Loop BB10_14 Depth 3
-; CHECK-NEXT:    mul r12, r1, lr
+; CHECK-NEXT:    ldr r0, [sp, #112]
 ; CHECK-NEXT:    cmp.w r11, #0
-; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    mul r12, r1, r0
 ; CHECK-NEXT:    beq .LBB10_3
 ; CHECK-NEXT:  @ %bb.6: @ %for.cond5.preheader.us.us.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT:    mov.w r10, #0
+; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    b .LBB10_8
 ; CHECK-NEXT:  .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us
 ; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT:    add.w r3, r10, r12
-; CHECK-NEXT:    add.w r10, r10, #1
-; CHECK-NEXT:    cmp r10, lr
+; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    add.w r3, r1, r12
+; CHECK-NEXT:    adds r1, #1
 ; CHECK-NEXT:    strh.w r2, [r0, r3, lsl #1]
+; CHECK-NEXT:    ldr r0, [sp, #112]
+; CHECK-NEXT:    cmp r1, r0
 ; CHECK-NEXT:    beq .LBB10_4
 ; CHECK-NEXT:  .LBB10_8: @ %for.cond5.preheader.us.us
 ; CHECK-NEXT:    @ Parent Loop BB10_5 Depth=1
@@ -677,15 +678,15 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
 ; CHECK-NEXT:    cmp.w r11, #3
 ; CHECK-NEXT:    bhi .LBB10_10
 ; CHECK-NEXT:  @ %bb.9: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT:    movs r7, #0
+; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    b .LBB10_13
 ; CHECK-NEXT:  .LBB10_10: @ %vector.ph
 ; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
 ; CHECK-NEXT:    vmov q1, q4
-; CHECK-NEXT:    ldr r2, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    vmlas.u32 q1, q5, r10
+; CHECK-NEXT:    vmlas.u32 q1, q5, r1
 ; CHECK-NEXT:    dls lr, r8
 ; CHECK-NEXT:  .LBB10_11: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB10_5 Depth=1
@@ -700,29 +701,29 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
 ; CHECK-NEXT:    le lr, .LBB10_11
 ; CHECK-NEXT:  @ %bb.12: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT:    ldr r7, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    vaddv.u32 r2, q0
-; CHECK-NEXT:    ldr.w lr, [sp, #120]
-; CHECK-NEXT:    cmp r7, r11
+; CHECK-NEXT:    cmp r10, r11
+; CHECK-NEXT:    mov r5, r10
 ; CHECK-NEXT:    beq .LBB10_7
 ; CHECK-NEXT:  .LBB10_13: @ %for.body8.us.us.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT:    mla r3, lr, r7, r10
-; CHECK-NEXT:    sub.w r5, r11, r7
-; CHECK-NEXT:    add r7, r9
-; CHECK-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT:    add.w r7, r0, r7, lsl #1
+; CHECK-NEXT:    ldr r0, [sp, #112]
+; CHECK-NEXT:    sub.w lr, r11, r5
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    mla r3, r0, r5, r1
+; CHECK-NEXT:    add r5, r9
+; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    add.w r5, r0, r5, lsl #1
 ; CHECK-NEXT:    add.w r3, r6, r3, lsl #1
 ; CHECK-NEXT:  .LBB10_14: @ %for.body8.us.us
 ; CHECK-NEXT:    @ Parent Loop BB10_5 Depth=1
 ; CHECK-NEXT:    @ Parent Loop BB10_8 Depth=2
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT:    ldrsh.w r1, [r3]
+; CHECK-NEXT:    ldrsh.w r0, [r3]
 ; CHECK-NEXT:    add r3, r4
-; CHECK-NEXT:    ldrsh r0, [r7], #2
-; CHECK-NEXT:    subs r5, #1
-; CHECK-NEXT:    smlabb r2, r1, r0, r2
-; CHECK-NEXT:    bne .LBB10_14
+; CHECK-NEXT:    ldrsh r7, [r5], #2
+; CHECK-NEXT:    smlabb r2, r0, r7, r2
+; CHECK-NEXT:    le lr, .LBB10_14
 ; CHECK-NEXT:    b .LBB10_7
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.15:


        


More information about the llvm-commits mailing list