[llvm] 1bcb6a3 - [MBP] Enable duplicating return block to remove jump to return
Guozhi Wei via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 21 11:57:24 PDT 2023
Author: Guozhi Wei
Date: 2023-06-21T18:54:31Z
New Revision: 1bcb6a3da231ee3bcf8513880599b5d054f590a4
URL: https://github.com/llvm/llvm-project/commit/1bcb6a3da231ee3bcf8513880599b5d054f590a4
DIFF: https://github.com/llvm/llvm-project/commit/1bcb6a3da231ee3bcf8513880599b5d054f590a4.diff
LOG: [MBP] Enable duplicating return block to remove jump to return
Sometimes LLVM generates a branch to a return instruction, as in PR63227.
This is because, in the function MachineBlockPlacement::canTailDuplicateUnplacedPreds,
we avoid duplicating a BB into another already placed BB to prevent destroying
the computed layout. But if the successor BB is a return block, duplicating it
will only reduce taken branches without hurting any other branches.
Differential Revision: https://reviews.llvm.org/D153093
Added:
llvm/test/CodeGen/Thumb/branch-to-return.ll
Modified:
llvm/lib/CodeGen/MachineBlockPlacement.cpp
llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index eec602279b707..912e9ec993e3c 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -1159,7 +1159,7 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
// tail-duplicated into.
// Skip any blocks that are already placed or not in this loop.
if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
- || BlockToChain[Pred] == &Chain)
+ || (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
continue;
if (!TailDup.canTailDuplicate(Succ, Pred)) {
if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 8dd4da1ee4401..e4c776b000a13 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -425,7 +425,8 @@ define i16 @red_mla_dup_ext_u8_s8_s16(i8* noalias nocapture noundef readonly %A,
; CHECK-NEXT: b .LBB5_7
; CHECK-NEXT: .LBB5_3:
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: b .LBB5_9
+; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: ret
; CHECK-NEXT: .LBB5_4: // %vector.ph
; CHECK-NEXT: and x11, x10, #0xfffffff0
; CHECK-NEXT: add x8, x0, #8
diff --git a/llvm/test/CodeGen/Thumb/branch-to-return.ll b/llvm/test/CodeGen/Thumb/branch-to-return.ll
new file mode 100644
index 0000000000000..5bfccc0637550
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb/branch-to-return.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv7 %s -o - | FileCheck %s
+
+; Test that the branch to return in BB4 is converted to a return.
+
+define i32 @foo(i32* %x, i32 %n) {
+; CHECK-LABEL: foo:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cmp r1, #1
+; CHECK-NEXT: blt .LBB0_4
+; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT: bic r3, r1, #3
+; CHECK-NEXT: mov r12, r0
+; CHECK-NEXT: cmp r1, #4
+; CHECK-NEXT: bhs .LBB0_3
+; CHECK-NEXT: @ %bb.2:
+; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: b .LBB0_6
+; CHECK-NEXT: .LBB0_3: @ %middle.block
+; CHECK-NEXT: cmp r1, r3
+; CHECK-NEXT: bne .LBB0_5
+; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: bx lr
+; CHECK-NEXT: .LBB0_5:
+; CHECK-NEXT: ldr.w r0, [r12]
+; CHECK-NEXT: .LBB0_6: @ %for.body.preheader1
+; CHECK-NEXT: subs r3, r1, r3
+; CHECK-NEXT: mvn r2, #12
+; CHECK-NEXT: and.w r1, r2, r1, lsl #2
+; CHECK-NEXT: add r1, r12
+; CHECK-NEXT: .LBB0_7: @ %for.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr r2, [r1], #4
+; CHECK-NEXT: subs r3, #1
+; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: bne .LBB0_7
+; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup
+; CHECK-NEXT: bx lr
+entry:
+ %n.vec = and i32 %n, -4
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %min.iters.check = icmp ult i32 %n, 4
+ br i1 %min.iters.check, label %for.body.preheader1, label %middle.block
+
+middle.block:
+ %x3 = load i32, i32* %x, align 4
+ %cmp.n = icmp eq i32 %n.vec, %n
+ br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
+
+for.body.preheader1: ; preds = %middle.block, %for.body.preheader
+ %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %x3, %middle.block ]
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader1, %for.body
+ %i.08 = phi i32 [ %inc, %for.body ], [ %n.vec, %for.body.preheader1 ]
+ %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
+ %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
+ %v5 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %v5, %r.07
+ %inc = add nuw nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
+ %r.0.lcssa = phi i32 [ 0, %entry ], [ 0, %middle.block ], [ %add, %for.body ]
+ ret i32 %r.0.lcssa
+}
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
index 23eb5900bb7d1..cc6d0925d1803 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
@@ -1417,7 +1417,7 @@ define arm_aapcs_vfpcc float @half_half_mac(ptr nocapture readonly %a, ptr nocap
; CHECK-NEXT: b .LBB9_6
; CHECK-NEXT: .LBB9_3:
; CHECK-NEXT: vldr s0, .LCPI9_0
-; CHECK-NEXT: b .LBB9_9
+; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .LBB9_4: @ %for.body.preheader.new
; CHECK-NEXT: bic r2, r2, #3
; CHECK-NEXT: movs r3, #1
@@ -1572,7 +1572,7 @@ define arm_aapcs_vfpcc float @half_half_acc(ptr nocapture readonly %a, ptr nocap
; CHECK-NEXT: b .LBB10_6
; CHECK-NEXT: .LBB10_3:
; CHECK-NEXT: vldr s0, .LCPI10_0
-; CHECK-NEXT: b .LBB10_9
+; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .LBB10_4: @ %for.body.preheader.new
; CHECK-NEXT: bic r2, r2, #3
; CHECK-NEXT: movs r3, #1
@@ -1727,7 +1727,7 @@ define arm_aapcs_vfpcc float @half_short_mac(ptr nocapture readonly %a, ptr noca
; CHECK-NEXT: b .LBB11_6
; CHECK-NEXT: .LBB11_3:
; CHECK-NEXT: vldr s0, .LCPI11_0
-; CHECK-NEXT: b .LBB11_9
+; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: .LBB11_4: @ %for.body.preheader.new
; CHECK-NEXT: bic r2, r2, #3
; CHECK-NEXT: movs r3, #1
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
index f5adcf0427649..6ab1a9344bb23 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
@@ -18,7 +18,7 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB0_7
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: movs r0, #0
-; CHECK-NEXT: b .LBB0_9
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB0_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
@@ -43,7 +43,7 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: ldr r1, [r2], #4
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: le lr, .LBB0_8
-; CHECK-NEXT: .LBB0_9: @ %for.cond.cleanup
+; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
@@ -201,7 +201,8 @@ define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB2_7
; CHECK-NEXT: .LBB2_3:
; CHECK-NEXT: mov.w r2, #-1
-; CHECK-NEXT: b .LBB2_9
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .LBB2_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
@@ -297,7 +298,8 @@ define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB3_7
; CHECK-NEXT: .LBB3_3:
; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: b .LBB3_9
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .LBB3_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
@@ -393,7 +395,8 @@ define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB4_7
; CHECK-NEXT: .LBB4_3:
; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: b .LBB4_9
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .LBB4_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
@@ -489,7 +492,8 @@ define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB5_7
; CHECK-NEXT: .LBB5_3:
; CHECK-NEXT: vldr s0, .LCPI5_0
-; CHECK-NEXT: b .LBB5_9
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB5_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
@@ -587,7 +591,8 @@ define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB6_7
; CHECK-NEXT: .LBB6_3:
; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
-; CHECK-NEXT: b .LBB6_9
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB6_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
@@ -681,7 +686,8 @@ define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB7_7
; CHECK-NEXT: .LBB7_3:
; CHECK-NEXT: mvn r2, #-2147483648
-; CHECK-NEXT: b .LBB7_9
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB7_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
@@ -778,7 +784,7 @@ define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB8_7
; CHECK-NEXT: .LBB8_3:
; CHECK-NEXT: mvn r0, #-2147483648
-; CHECK-NEXT: b .LBB8_9
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB8_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
@@ -804,7 +810,7 @@ define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, lt
; CHECK-NEXT: le lr, .LBB8_8
-; CHECK-NEXT: .LBB8_9: @ %for.cond.cleanup
+; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
@@ -873,7 +879,8 @@ define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB9_7
; CHECK-NEXT: .LBB9_3:
; CHECK-NEXT: mov.w r2, #-2147483648
-; CHECK-NEXT: b .LBB9_9
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB9_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
@@ -970,7 +977,7 @@ define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB10_7
; CHECK-NEXT: .LBB10_3:
; CHECK-NEXT: mov.w r0, #-2147483648
-; CHECK-NEXT: b .LBB10_9
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB10_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
@@ -996,7 +1003,7 @@ define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, gt
; CHECK-NEXT: le lr, .LBB10_8
-; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup
+; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
@@ -1065,7 +1072,8 @@ define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB11_7
; CHECK-NEXT: .LBB11_3:
; CHECK-NEXT: mov.w r2, #-1
-; CHECK-NEXT: b .LBB11_9
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB11_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
@@ -1162,7 +1170,7 @@ define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB12_7
; CHECK-NEXT: .LBB12_3:
; CHECK-NEXT: mov.w r0, #-1
-; CHECK-NEXT: b .LBB12_9
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB12_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
@@ -1188,7 +1196,7 @@ define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, hi
; CHECK-NEXT: le lr, .LBB12_8
-; CHECK-NEXT: .LBB12_9: @ %for.cond.cleanup
+; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
@@ -1257,7 +1265,8 @@ define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB13_7
; CHECK-NEXT: .LBB13_3:
; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: b .LBB13_9
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB13_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
@@ -1354,7 +1363,7 @@ define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB14_7
; CHECK-NEXT: .LBB14_3:
; CHECK-NEXT: movs r0, #0
-; CHECK-NEXT: b .LBB14_9
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB14_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
@@ -1380,7 +1389,7 @@ define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, hi
; CHECK-NEXT: le lr, .LBB14_8
-; CHECK-NEXT: .LBB14_9: @ %for.cond.cleanup
+; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
%cmp6 = icmp sgt i32 %n, 0
@@ -1449,7 +1458,8 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB15_7
; CHECK-NEXT: .LBB15_3:
; CHECK-NEXT: vldr s0, .LCPI15_0
-; CHECK-NEXT: b .LBB15_9
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB15_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
@@ -1552,7 +1562,8 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-NEXT: b .LBB16_7
; CHECK-NEXT: .LBB16_3:
; CHECK-NEXT: vldr s0, .LCPI16_0
-; CHECK-NEXT: b .LBB16_9
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB16_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
More information about the llvm-commits
mailing list