[llvm] 146d44c - [LSR] Don't require register reuse under postinc
David Green via llvm-commits
llvm-commits at lists.llvm.org
Tue May 5 08:05:09 PDT 2020
Author: David Green
Date: 2020-05-05T16:04:50+01:00
New Revision: 146d44c2511ab4d87f1d9c45dce56ad71427deab
URL: https://github.com/llvm/llvm-project/commit/146d44c2511ab4d87f1d9c45dce56ad71427deab
DIFF: https://github.com/llvm/llvm-project/commit/146d44c2511ab4d87f1d9c45dce56ad71427deab.diff
LOG: [LSR] Don't require register reuse under postinc
LSR has some logic that tries to aggressively reuse registers in
formulae. This can lead to sub-optimal decisions in complex loops where
the backend is trying to use shouldFavorPostInc. This disables the
re-use in those situations.
Differential Revision: https://reviews.llvm.org/D79301
Added:
Modified:
llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 36f8c68ef496..652ff6bfb6d8 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -4911,19 +4911,24 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
// Ignore formulae which may not be ideal in terms of register reuse of
// ReqRegs. The formula should use all required registers before
// introducing new ones.
- int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
- for (const SCEV *Reg : ReqRegs) {
- if ((F.ScaledReg && F.ScaledReg == Reg) ||
- is_contained(F.BaseRegs, Reg)) {
- --NumReqRegsToFind;
- if (NumReqRegsToFind == 0)
- break;
+ // This can sometimes (notably when trying to favour postinc) lead to
+ // sub-optimal decisions. There it is best left to the cost modelling to
+ // get correct.
+ if (!TTI.shouldFavorPostInc() || LU.Kind != LSRUse::Address) {
+ int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
+ for (const SCEV *Reg : ReqRegs) {
+ if ((F.ScaledReg && F.ScaledReg == Reg) ||
+ is_contained(F.BaseRegs, Reg)) {
+ --NumReqRegsToFind;
+ if (NumReqRegsToFind == 0)
+ break;
+ }
+ }
+ if (NumReqRegsToFind != 0) {
+ // If none of the formulae satisfied the required registers, then we could
+ // clear ReqRegs and try again. Currently, we simply give up in this case.
+ continue;
}
- }
- if (NumReqRegsToFind != 0) {
- // If none of the formulae satisfied the required registers, then we could
- // clear ReqRegs and try again. Currently, we simply give up in this case.
- continue;
}
// Evaluate the cost of the current formula. If it's already worse than
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
index f5a165bf4df3..8f01326c002f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
@@ -1059,39 +1059,36 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #72
-; CHECK-NEXT: sub sp, #72
+; CHECK-NEXT: .pad #56
+; CHECK-NEXT: sub sp, #56
; CHECK-NEXT: cmp r2, #8
-; CHECK-NEXT: strd r0, r1, [sp, #28] @ 8-byte Folded Spill
-; CHECK-NEXT: vstr s0, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: vstr s0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: mov r1, r2
-; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: blo.w .LBB7_9
; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: movs r6, #1
-; CHECK-NEXT: str r2, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: mov.w r10, #0
+; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: lsrs r1, r2, #2
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: str r2, [sp, #48] @ 4-byte Spill
; CHECK-NEXT: b .LBB7_3
; CHECK-NEXT: .LBB7_2: @ in Loop: Header=BB7_3 Depth=1
-; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: lsls r6, r6, #2
-; CHECK-NEXT: ldr r2, [sp, #48] @ 4-byte Reload
-; CHECK-NEXT: cmp r3, #7
-; CHECK-NEXT: asr.w r1, r3, #2
-; CHECK-NEXT: add.w r2, r2, #1
-; CHECK-NEXT: str r2, [sp, #48] @ 4-byte Spill
+; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: add.w r10, r10, #1
+; CHECK-NEXT: lsls r3, r3, #2
+; CHECK-NEXT: cmp r2, #7
+; CHECK-NEXT: asr.w r1, r2, #2
; CHECK-NEXT: ble .LBB7_9
; CHECK-NEXT: .LBB7_3: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB7_6 Depth 2
; CHECK-NEXT: @ Child Loop BB7_7 Depth 3
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT: cmp r6, #1
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: cmp r3, #1
+; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: lsr.w r2, r1, #2
-; CHECK-NEXT: str r2, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: blt .LBB7_2
; CHECK-NEXT: @ %bb.4: @ in Loop: Header=BB7_3 Depth=1
; CHECK-NEXT: movs r2, #0
@@ -1100,86 +1097,81 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf
; CHECK-NEXT: @ %bb.5: @ %.preheader
; CHECK-NEXT: @ in Loop: Header=BB7_3 Depth=1
; CHECK-NEXT: lsrs r2, r1, #3
-; CHECK-NEXT: lsls r0, r1, #1
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: str r2, [sp, #40] @ 4-byte Spill
-; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT: lsls r5, r1, #3
-; CHECK-NEXT: lsls r7, r1, #4
-; CHECK-NEXT: add.w r1, r1, r1, lsl #1
-; CHECK-NEXT: str r6, [sp, #44] @ 4-byte Spill
-; CHECK-NEXT: lsls r3, r1, #3
+; CHECK-NEXT: lsls r1, r1, #1
+; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT: movs r5, #0
+; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #32] @ 4-byte Spill
+; CHECK-NEXT: lsl.w r11, r2, #1
; CHECK-NEXT: .LBB7_6: @ Parent Loop BB7_3 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB7_7 Depth 3
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT: ldr r6, [sp, #48] @ 4-byte Reload
-; CHECK-NEXT: ldrd r4, r0, [r1, #24]
-; CHECK-NEXT: ldrd r12, r2, [r1, #16]
-; CHECK-NEXT: ldrd r8, r9, [r1, #32]
-; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
-; CHECK-NEXT: ldr.w lr, [sp, #40] @ 4-byte Reload
-; CHECK-NEXT: ldr.w r4, [r4, r6, lsl #2]
-; CHECK-NEXT: mul r1, r1, r10
-; CHECK-NEXT: ldr.w r11, [r2, r6, lsl #2]
-; CHECK-NEXT: ldr.w r6, [r12, r6, lsl #2]
+; CHECK-NEXT: add.w r12, r0, #16
+; CHECK-NEXT: ldr r4, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: ldr.w lr, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: ldm.w r12, {r1, r2, r3, r12}
+; CHECK-NEXT: muls r4, r5, r4
+; CHECK-NEXT: ldr.w r2, [r2, r10, lsl #2]
+; CHECK-NEXT: ldr.w r1, [r1, r10, lsl #2]
+; CHECK-NEXT: ldrd r6, r7, [r0, #32]
+; CHECK-NEXT: ldr.w r3, [r3, r10, lsl #2]
; CHECK-NEXT: dls lr, lr
-; CHECK-NEXT: ldr r2, [sp, #32] @ 4-byte Reload
-; CHECK-NEXT: add.w r12, r2, r1, lsl #2
-; CHECK-NEXT: add.w r2, r9, r4, lsl #2
+; CHECK-NEXT: add.w r6, r6, r2, lsl #2
+; CHECK-NEXT: add.w r12, r12, r1, lsl #2
+; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add.w r2, r1, r4, lsl #2
+; CHECK-NEXT: add.w r3, r7, r3, lsl #2
+; CHECK-NEXT: add.w r1, r2, r11, lsl #2
+; CHECK-NEXT: add.w r8, r1, r11, lsl #2
; CHECK-NEXT: add.w r9, r8, r11, lsl #2
-; CHECK-NEXT: add.w r8, r0, r6, lsl #2
; CHECK-NEXT: .LBB7_7: @ Parent Loop BB7_3 Depth=1
; CHECK-NEXT: @ Parent Loop BB7_6 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: add.w r4, r12, r7
-; CHECK-NEXT: add.w r1, r12, r3
-; CHECK-NEXT: add.w r6, r12, r5
-; CHECK-NEXT: vldrw.u32 q3, [r12]
-; CHECK-NEXT: vldrw.u32 q7, [r4]
+; CHECK-NEXT: vldrw.u32 q3, [r9]
; CHECK-NEXT: vldrw.u32 q4, [r1]
-; CHECK-NEXT: vldrw.u32 q5, [r6]
-; CHECK-NEXT: vsub.f32 q0, q3, q7
-; CHECK-NEXT: vadd.f32 q3, q7, q3
-; CHECK-NEXT: vsub.f32 q6, q5, q4
-; CHECK-NEXT: vcadd.f32 q1, q0, q6, #270
-; CHECK-NEXT: vcadd.f32 q2, q0, q6, #90
-; CHECK-NEXT: vadd.f32 q0, q5, q4
+; CHECK-NEXT: vldrw.u32 q6, [r8]
+; CHECK-NEXT: vldrw.u32 q7, [r2]
+; CHECK-NEXT: vsub.f32 q5, q4, q3
+; CHECK-NEXT: vsub.f32 q0, q7, q6
+; CHECK-NEXT: vcadd.f32 q1, q0, q5, #270
+; CHECK-NEXT: vcadd.f32 q2, q0, q5, #90
+; CHECK-NEXT: vadd.f32 q0, q4, q3
+; CHECK-NEXT: vadd.f32 q3, q6, q7
; CHECK-NEXT: vsub.f32 q4, q3, q0
; CHECK-NEXT: vadd.f32 q0, q3, q0
-; CHECK-NEXT: vstrb.8 q0, [r12], #16
-; CHECK-NEXT: vldrw.u32 q0, [r9], #16
+; CHECK-NEXT: vstrb.8 q0, [r2], #16
+; CHECK-NEXT: vldrw.u32 q0, [r6], #16
; CHECK-NEXT: vcmul.f32 q3, q0, q4, #0
; CHECK-NEXT: vcmla.f32 q3, q0, q4, #90
-; CHECK-NEXT: vstrw.32 q3, [r6]
-; CHECK-NEXT: vldrw.u32 q0, [r8], #16
+; CHECK-NEXT: vstrb.8 q3, [r1], #16
+; CHECK-NEXT: vldrw.u32 q0, [r12], #16
; CHECK-NEXT: vcmul.f32 q3, q0, q2, #0
; CHECK-NEXT: vcmla.f32 q3, q0, q2, #90
-; CHECK-NEXT: vstrw.32 q3, [r4]
-; CHECK-NEXT: vldrw.u32 q0, [r2], #16
+; CHECK-NEXT: vstrb.8 q3, [r8], #16
+; CHECK-NEXT: vldrw.u32 q0, [r3], #16
; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0
; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90
-; CHECK-NEXT: vstrw.32 q2, [r1]
+; CHECK-NEXT: vstrb.8 q2, [r9], #16
; CHECK-NEXT: le lr, .LBB7_7
; CHECK-NEXT: @ %bb.8: @ in Loop: Header=BB7_6 Depth=2
-; CHECK-NEXT: ldr r6, [sp, #44] @ 4-byte Reload
-; CHECK-NEXT: add.w r10, r10, #1
-; CHECK-NEXT: cmp r10, r6
+; CHECK-NEXT: ldr r3, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: adds r5, #1
+; CHECK-NEXT: cmp r5, r3
; CHECK-NEXT: bne .LBB7_6
; CHECK-NEXT: b .LBB7_2
; CHECK-NEXT: .LBB7_9:
; CHECK-NEXT: adr r0, .LCPI7_0
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vldrw.u32 q2, [q1, #64]!
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: lsr.w lr, r0, #3
; CHECK-NEXT: wls lr, lr, .LBB7_12
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: vldrw.u32 q3, [q1, #16]
-; CHECK-NEXT: vldr s0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: vldr s0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: .LBB7_11: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [q1, #24]
@@ -1190,14 +1182,14 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf
; CHECK-NEXT: vsub.f32 q0, q4, q0
; CHECK-NEXT: vsub.f32 q7, q6, q5
; CHECK-NEXT: vcadd.f32 q4, q2, q0, #270
-; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill
; CHECK-NEXT: vcadd.f32 q7, q2, q0, #90
; CHECK-NEXT: vadd.f32 q0, q6, q5
; CHECK-NEXT: vldrw.u32 q2, [q1, #64]!
; CHECK-NEXT: vmul.f32 q0, q0, r0
; CHECK-NEXT: vldrw.u32 q3, [q1, #16]
; CHECK-NEXT: vstrw.32 q0, [q1, #-64]
-; CHECK-NEXT: vldrw.u32 q5, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vmul.f32 q0, q4, r0
; CHECK-NEXT: vmul.f32 q4, q7, r0
; CHECK-NEXT: vmul.f32 q5, q5, r0
@@ -1206,7 +1198,7 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf
; CHECK-NEXT: vstrw.32 q0, [q1, #-40]
; CHECK-NEXT: le lr, .LBB7_11
; CHECK-NEXT: .LBB7_12:
-; CHECK-NEXT: add sp, #72
+; CHECK-NEXT: add sp, #56
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
More information about the llvm-commits
mailing list