[llvm] f5abf0b - [ARM] Tail predication with constant loop bounds
David Green via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 15 10:17:53 PST 2021
Author: David Green
Date: 2021-01-15T18:17:31Z
New Revision: f5abf0bd485a1fa7e332f5f8266c25755d385a8a
URL: https://github.com/llvm/llvm-project/commit/f5abf0bd485a1fa7e332f5f8266c25755d385a8a
DIFF: https://github.com/llvm/llvm-project/commit/f5abf0bd485a1fa7e332f5f8266c25755d385a8a.diff
LOG: [ARM] Tail predication with constant loop bounds
The TripCount for a predicated vector loop body will be
ceil(ElementCount/Width). This alters the conversion of an
active.lane.mask to a VCTP intrinsic to match.
Differential Revision: https://reviews.llvm.org/D94608
Added:
Modified:
llvm/lib/Target/ARM/MVETailPredication.cpp
llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 8055b5cf500d..b705208660df 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -230,18 +230,16 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
}
// Calculate 2 tripcount values and check that they are consistent with
- // each other:
- // i) The number of loop iterations extracted from the set.loop.iterations
- // intrinsic, multipled by the vector width:
- uint64_t TC1 = TC->getZExtValue() * VectorWidth;
-
- // ii) TC1 has to be equal to TC + 1, with the + 1 to compensate for start
- // counting from 0.
- uint64_t TC2 = ConstElemCount->getZExtValue() + 1;
-
- // If the tripcount values are inconsistent, we don't want to insert the
- // VCTP and trigger tail-predication; it's better to keep intrinsic
- // get.active.lane.mask and legalize this.
+ // each other. The TripCount for a predicated vector loop body is
+ // ceil(ElementCount/Width), or floor((ElementCount+Width-1)/Width) as we
+ // work it out here.
+ uint64_t TC1 = TC->getZExtValue();
+ uint64_t TC2 =
+ (ConstElemCount->getZExtValue() + VectorWidth - 1) / VectorWidth;
+
+ // If the tripcount values are inconsistent, we can't insert the VCTP and
+ // trigger tail-predication; keep the intrinsic as a get.active.lane.mask
+ // and legalize this.
if (TC1 != TC2) {
LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: "
<< TC1 << " from set.loop.iterations, and "
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll
index 480680bee89d..d1f5a07bc4a9 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll
@@ -62,41 +62,17 @@ define dso_local i32 @test_501_504(i32* nocapture readonly %x) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: adr r2, .LCPI1_0
-; CHECK-NEXT: mov.w lr, #126
-; CHECK-NEXT: vldrw.u32 q0, [r2]
-; CHECK-NEXT: adr r2, .LCPI1_1
-; CHECK-NEXT: vldrw.u32 q1, [r2]
-; CHECK-NEXT: dls lr, lr
-; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: movw r1, #501
; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: .LBB1_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q2, q0, r1
-; CHECK-NEXT: vdup.32 q3, r1
-; CHECK-NEXT: vcmp.u32 hi, q3, q2
-; CHECK-NEXT: adds r1, #4
-; CHECK-NEXT: vpnot
-; CHECK-NEXT: vpsttt
-; CHECK-NEXT: vcmpt.u32 hi, q1, q2
-; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
-; CHECK-NEXT: vaddvat.u32 r2, q2
-; CHECK-NEXT: le lr, .LBB1_1
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: letp lr, .LBB1_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: .LCPI1_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
-; CHECK-NEXT: .LCPI1_1:
-; CHECK-NEXT: .long 501 @ 0x1f5
-; CHECK-NEXT: .long 501 @ 0x1f5
-; CHECK-NEXT: .long 501 @ 0x1f5
-; CHECK-NEXT: .long 501 @ 0x1f5
entry:
br label %vector.body
@@ -123,41 +99,17 @@ define dso_local i32 @test_502_504(i32* nocapture readonly %x) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: adr r2, .LCPI2_0
-; CHECK-NEXT: mov.w lr, #126
-; CHECK-NEXT: vldrw.u32 q0, [r2]
-; CHECK-NEXT: adr r2, .LCPI2_1
-; CHECK-NEXT: vldrw.u32 q1, [r2]
-; CHECK-NEXT: dls lr, lr
-; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: mov.w r1, #502
; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q2, q0, r1
-; CHECK-NEXT: vdup.32 q3, r1
-; CHECK-NEXT: vcmp.u32 hi, q3, q2
-; CHECK-NEXT: adds r1, #4
-; CHECK-NEXT: vpnot
-; CHECK-NEXT: vpsttt
-; CHECK-NEXT: vcmpt.u32 hi, q1, q2
-; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
-; CHECK-NEXT: vaddvat.u32 r2, q2
-; CHECK-NEXT: le lr, .LBB2_1
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: letp lr, .LBB2_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: .LCPI2_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
-; CHECK-NEXT: .LCPI2_1:
-; CHECK-NEXT: .long 502 @ 0x1f6
-; CHECK-NEXT: .long 502 @ 0x1f6
-; CHECK-NEXT: .long 502 @ 0x1f6
-; CHECK-NEXT: .long 502 @ 0x1f6
entry:
br label %vector.body
@@ -221,36 +173,17 @@ define dso_local i32 @test_504_504(i32* nocapture readonly %x) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: mov.w lr, #126
-; CHECK-NEXT: adr r2, .LCPI4_0
-; CHECK-NEXT: vldrw.u32 q0, [r2]
-; CHECK-NEXT: mov.w r2, #504
-; CHECK-NEXT: dls lr, lr
-; CHECK-NEXT: vdup.32 q1, r2
-; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: mov.w r1, #504
; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q2, q0, r1
-; CHECK-NEXT: vdup.32 q3, r1
-; CHECK-NEXT: vcmp.u32 hi, q3, q2
-; CHECK-NEXT: adds r1, #4
-; CHECK-NEXT: vpnot
-; CHECK-NEXT: vpsttt
-; CHECK-NEXT: vcmpt.u32 hi, q1, q2
-; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
-; CHECK-NEXT: vaddvat.u32 r2, q2
-; CHECK-NEXT: le lr, .LBB4_1
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
+; CHECK-NEXT: vaddva.u32 r2, q0
+; CHECK-NEXT: letp lr, .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: .LCPI4_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
entry:
br label %vector.body
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
index 64c00ed03032..7777d7a6894a 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
@@ -5,53 +5,25 @@ define dso_local arm_aapcs_vfpcc i32 @minmaxval4(i32* nocapture readonly %x, i32
; CHECK-LABEL: minmaxval4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: sub sp, #8
-; CHECK-NEXT: mov.w lr, #3
-; CHECK-NEXT: adr r3, .LCPI0_0
-; CHECK-NEXT: dls lr, lr
-; CHECK-NEXT: vldrw.u32 q2, [r3]
; CHECK-NEXT: vmov.i32 q0, #0x80000000
; CHECK-NEXT: vmvn.i32 q1, #0x80000000
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: vmov.i32 q3, #0xa
+; CHECK-NEXT: movs r2, #10
+; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q4, q2, r2
-; CHECK-NEXT: vdup.32 q5, r2
-; CHECK-NEXT: vcmp.u32 hi, q5, q4
-; CHECK-NEXT: adds r2, #4
-; CHECK-NEXT: vpnot
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vcmpt.u32 hi, q3, q4
-; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q4, [r0], #16
-; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vcmpt.s32 gt, q4, q0
-; CHECK-NEXT: vpsel q0, q4, q0
-; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vcmpt.s32 gt, q1, q4
-; CHECK-NEXT: vpsel q1, q4, q1
-; CHECK-NEXT: le lr, .LBB0_1
+; CHECK-NEXT: vldrw.u32 q2, [r0], #16
+; CHECK-NEXT: vpt.s32 gt, q2, q0
+; CHECK-NEXT: vmovt q0, q2
+; CHECK-NEXT: vpt.s32 gt, q1, q2
+; CHECK-NEXT: vmovt q1, q2
+; CHECK-NEXT: letp lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: vminv.s32 r0, q1
; CHECK-NEXT: str r0, [r1]
; CHECK-NEXT: mov.w r0, #-2147483648
; CHECK-NEXT: vmaxv.s32 r0, q0
-; CHECK-NEXT: add sp, #8
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: pop {r7, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: .LCPI0_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
entry:
br label %vector.body
More information about the llvm-commits
mailing list