[llvm] [ARM] Don't block tail-predication from unrelated VPT blocks. (PR #94239)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 3 09:13:37 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-arm
Author: David Green (davemgreen)
<details>
<summary>Changes</summary>
VPT blocks that do not produce an interesting 'output' (like a stored value or reduction result), do not need to be predicated on vctp for the whole loop to be tail-predicated. Just producing results for the valid tail predication lanes should be enough.
---
Full diff: https://github.com/llvm/llvm-project/pull/94239.diff
2 Files Affected:
- (modified) llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp (+16-6)
- (modified) llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll (+5-20)
``````````diff
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index a46c383115e2d..d7c89ca6c6a57 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -115,6 +115,12 @@ static bool shouldInspect(MachineInstr &MI) {
return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
}
+static bool isHorizontalReduction(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t Flags = MCID.TSFlags;
+ return (Flags & ARMII::HorizontalReduction) != 0;
+}
+
namespace {
using InstSet = SmallPtrSetImpl<MachineInstr *>;
@@ -275,6 +281,16 @@ namespace {
if (VPT->getOpcode() == ARM::MVE_VPST)
return false;
+ // If the VPT block does not define something that is an "output", then the
+ // tail-predicated version will just perform a subset of the original vpt
+ // block, where the last lanes should not be used.
+ if (isVPTOpcode(VPT->getOpcode()) &&
+ all_of(Block.getInsts(), [](const MachineInstr *MI) {
+ return !MI->mayStore() && !isHorizontalReduction(*MI) &&
+ !isVCTP(MI);
+ }))
+ return true;
+
auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
@@ -813,12 +829,6 @@ static bool producesDoubleWidthResult(const MachineInstr &MI) {
return (Flags & ARMII::DoubleWidthResult) != 0;
}
-static bool isHorizontalReduction(const MachineInstr &MI) {
- const MCInstrDesc &MCID = MI.getDesc();
- uint64_t Flags = MCID.TSFlags;
- return (Flags & ARMII::HorizontalReduction) != 0;
-}
-
// Can this instruction generate a non-zero result when given only zeroed
// operands? This allows us to know that, given operands with false bytes
// zeroed by masked loads, that the result will also contain zeros in those
diff --git a/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll b/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll
index f9b3757bb6d2c..6392452e06cdd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll
@@ -20,50 +20,35 @@ define void @convert_vptblock(ptr %pchTarget, i16 signext %iTargetStride, ptr %p
; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: ldrd r4, r5, [sp, #88]
; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: cmp.w r10, #8
-; CHECK-NEXT: mov.w r0, #1
-; CHECK-NEXT: mov r3, r10
; CHECK-NEXT: mov.w r11, #0
-; CHECK-NEXT: it ge
-; CHECK-NEXT: movge r3, #8
; CHECK-NEXT: vidup.u16 q0, r8, #4
-; CHECK-NEXT: sub.w r3, r10, r3
; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: adds r3, #7
; CHECK-NEXT: vmov.i16 q2, #0x100
; CHECK-NEXT: vmov.i16 q3, #0xff
-; CHECK-NEXT: add.w r9, r0, r3, lsr #3
; CHECK-NEXT: .LBB0_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_3 Depth 2
-; CHECK-NEXT: mov r3, r10
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: mov r6, r8
; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: dls lr, r9
+; CHECK-NEXT: dlstp.16 lr, r10
; CHECK-NEXT: .LBB0_3: @ %do.body
; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vctp.16 r3
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrbt.u16 q5, [r2, q4]
+; CHECK-NEXT: vldrb.u16 q5, [r2, q4]
; CHECK-NEXT: vmul.i16 q4, q5, r5
; CHECK-NEXT: vshr.u16 q4, q4, #8
; CHECK-NEXT: vsub.i16 q5, q2, q4
; CHECK-NEXT: vpt.i16 eq, q4, q3
; CHECK-NEXT: vmovt q5, q1
-; CHECK-NEXT: vctp.16 r3
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrbt.u16 q6, [r0]
+; CHECK-NEXT: vldrb.u16 q6, [r0]
; CHECK-NEXT: vsub.i16 q4, q2, q5
-; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vmul.i16 q5, q5, q6
; CHECK-NEXT: vmla.i16 q5, q4, r4
; CHECK-NEXT: vshr.u16 q4, q5, #8
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrbt.16 q4, [r0], #8
+; CHECK-NEXT: vstrb.16 q4, [r0], #8
; CHECK-NEXT: vidup.u16 q4, r6, #4
-; CHECK-NEXT: le lr, .LBB0_3
+; CHECK-NEXT: letp lr, .LBB0_3
; CHECK-NEXT: @ %bb.4: @ %do.end
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: add.w r0, r11, #1
``````````
</details>
https://github.com/llvm/llvm-project/pull/94239
More information about the llvm-commits
mailing list