[llvm-branch-commits] [llvm] 0e49a40 - [ARM] Cleanup for the MVETailPrediction pass
David Green via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Nov 26 07:24:11 PST 2020
Author: David Green
Date: 2020-11-26T15:10:44Z
New Revision: 0e49a40d756b4487aebea436f8f84411c1a629e7
URL: https://github.com/llvm/llvm-project/commit/0e49a40d756b4487aebea436f8f84411c1a629e7
DIFF: https://github.com/llvm/llvm-project/commit/0e49a40d756b4487aebea436f8f84411c1a629e7.diff
LOG: [ARM] Cleanup for the MVETailPrediction pass
This strips out a lot of the code that should no longer be needed from
the MVETailPredication pass, leaving the important part: finding active
lane mask instructions and converting them to VCTP operations.
Differential Revision: https://reviews.llvm.org/D91866
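To illustrate the transformation the pass now performs, here is a minimal
before/after sketch in LLVM IR (hypothetical names and a <4 x i32> loop with
element count %N; not taken from the commit):

; Before: the vectorizer predicates the loop on an active lane mask.
vector.body:
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  ; ... loads/stores masked by %mask ...
  %index.next = add i32 %index, 4

; After: the mask is rebuilt as a VCTP driven by a phi that counts the
; elements left to process, as InsertVCTPIntrinsic does in the diff below.
vector.body:
  %elts = phi i32 [ %N, %vector.ph ], [ %elts.rem, %vector.body ]
  %mask = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elts)
  ; ... the same loads/stores, now masked by the VCTP result ...
  %elts.rem = sub i32 %elts, 4

The ARM Low-overhead loop pass later recognises this shape and emits a
DLSTP/WLSTP tail-predicated loop, as the test changes below show.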
Added:
Modified:
llvm/lib/Target/ARM/MVETailPredication.cpp
llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
llvm/test/CodeGen/Thumb2/active_lane_mask.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 25d5fd7e69c6..8055b5cf500d 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -22,23 +22,13 @@
/// The HardwareLoops pass inserts intrinsics identifying loops that the
/// backend will attempt to convert into a low-overhead loop. The vectorizer is
/// responsible for generating a vectorized loop in which the lanes are
-/// predicated upon the iteration counter. This pass looks at these predicated
-/// vector loops, that are targets for low-overhead loops, and prepares it for
-/// code generation. Once the vectorizer has produced a masked loop, there's a
-/// couple of final forms:
-/// - A tail-predicated loop, with implicit predication.
-/// - A loop containing multiple VCPT instructions, predicating multiple VPT
-/// blocks of instructions operating on different vector types.
-///
-/// This pass:
-/// 1) Checks if the predicates of the masked load/store instructions are
-/// generated by intrinsic @llvm.get.active.lanes(). This intrinsic consumes
-/// the scalar loop tripcount as its second argument, which we extract
-/// to set up the number of elements processed by the loop.
-/// 2) Intrinsic @llvm.get.active.lanes() is then replaced by the MVE target
-/// specific VCTP intrinsic to represent the effect of tail predication.
-/// This will be picked up by the ARM Low-overhead loop pass, which performs
-/// the final transformation to a DLSTP or WLSTP tail-predicated loop.
+/// predicated upon a get.active.lane.mask intrinsic. This pass looks at these
+/// get.active.lane.mask intrinsics and attempts to convert them to VCTP
+/// instructions. This will be picked up by the ARM Low-overhead loop pass later
+/// in the backend, which performs the final transformation to a DLSTP or WLSTP
+/// tail-predicated loop.
+//
+//===----------------------------------------------------------------------===//
#include "ARM.h"
#include "ARMSubtarget.h"
@@ -57,6 +47,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
@@ -112,23 +103,18 @@ class MVETailPredication : public LoopPass {
bool runOnLoop(Loop *L, LPPassManager&) override;
private:
- /// Perform the relevant checks on the loop and convert if possible.
- bool TryConvert(Value *TripCount);
-
- /// Return whether this is a vectorized loop, that contains masked
- /// load/stores.
- bool IsPredicatedVectorLoop();
+ /// Perform the relevant checks on the loop and convert active lane masks if
+ /// possible.
+ bool TryConvertActiveLaneMask(Value *TripCount);
/// Perform several checks on the arguments of @llvm.get.active.lane.mask
/// intrinsic. E.g., check that the loop induction variable and the element
/// count are of the form we expect, and also perform overflow checks for
/// the new expressions that are created.
- bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount,
- FixedVectorType *VecTy);
+ bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount);
/// Insert the intrinsic to represent the effect of tail predication.
- void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
- FixedVectorType *VecTy);
+ void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount);
/// Rematerialize the iteration count in exit blocks, which enables
/// ARMLowOverheadLoops to better optimise away loop update statements inside
@@ -138,25 +124,6 @@ class MVETailPredication : public LoopPass {
} // end namespace
-static bool IsDecrement(Instruction &I) {
- auto *Call = dyn_cast<IntrinsicInst>(&I);
- if (!Call)
- return false;
-
- Intrinsic::ID ID = Call->getIntrinsicID();
- return ID == Intrinsic::loop_decrement_reg;
-}
-
-static bool IsMasked(Instruction *I) {
- auto *Call = dyn_cast<IntrinsicInst>(I);
- if (!Call)
- return false;
-
- Intrinsic::ID ID = Call->getIntrinsicID();
- return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load ||
- isGatherScatter(Call);
-}
-
bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
if (skipLoop(L) || !EnableTailPredication)
return false;
@@ -207,147 +174,11 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
return false;
}
- // Search for the hardware loop intrinsic that decrements the loop counter.
- IntrinsicInst *Decrement = nullptr;
- for (auto *BB : L->getBlocks()) {
- for (auto &I : *BB) {
- if (IsDecrement(I)) {
- Decrement = cast<IntrinsicInst>(&I);
- break;
- }
- }
- }
-
- if (!Decrement)
- return false;
-
- LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
- << *Decrement << "\n");
-
- if (!TryConvert(Setup->getArgOperand(0))) {
- LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
- return false;
- }
-
- return true;
-}
-
-static FixedVectorType *getVectorType(IntrinsicInst *I) {
- unsigned ID = I->getIntrinsicID();
- FixedVectorType *VecTy;
- if (ID == Intrinsic::masked_load || isGather(I)) {
- if (ID == Intrinsic::arm_mve_vldr_gather_base_wb ||
- ID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated)
- // then the type is a StructType
- VecTy = dyn_cast<FixedVectorType>(I->getType()->getContainedType(0));
- else
- VecTy = dyn_cast<FixedVectorType>(I->getType());
- } else if (ID == Intrinsic::masked_store) {
- VecTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
- } else {
- VecTy = dyn_cast<FixedVectorType>(I->getOperand(2)->getType());
- }
- assert(VecTy && "No scalable vectors expected here");
- return VecTy;
-}
-
-bool MVETailPredication::IsPredicatedVectorLoop() {
- // Check that the loop contains at least one masked load/store intrinsic.
- // We only support 'normal' vector instructions - other than masked
- // load/stores.
- bool ActiveLaneMask = false;
- for (auto *BB : L->getBlocks()) {
- for (auto &I : *BB) {
- auto *Int = dyn_cast<IntrinsicInst>(&I);
- if (!Int)
- continue;
-
- switch (Int->getIntrinsicID()) {
- case Intrinsic::get_active_lane_mask:
- ActiveLaneMask = true;
- continue;
- case Intrinsic::sadd_sat:
- case Intrinsic::uadd_sat:
- case Intrinsic::ssub_sat:
- case Intrinsic::usub_sat:
- case Intrinsic::vector_reduce_add:
- continue;
- case Intrinsic::fma:
- case Intrinsic::trunc:
- case Intrinsic::rint:
- case Intrinsic::round:
- case Intrinsic::floor:
- case Intrinsic::ceil:
- case Intrinsic::fabs:
- if (ST->hasMVEFloatOps())
- continue;
- break;
- default:
- break;
- }
- if (IsMasked(&I)) {
- auto *VecTy = getVectorType(Int);
- unsigned Lanes = VecTy->getNumElements();
- unsigned ElementWidth = VecTy->getScalarSizeInBits();
- // MVE vectors are 128-bit, but don't support 128 x i1.
- // TODO: Can we support vectors larger than 128-bits?
- unsigned MaxWidth = TTI->getRegisterBitWidth(true);
- if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
- return false;
- MaskedInsts.push_back(cast<IntrinsicInst>(&I));
- continue;
- }
-
- for (const Use &U : Int->args()) {
- if (isa<VectorType>(U->getType()))
- return false;
- }
- }
- }
-
- if (!ActiveLaneMask) {
- LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n");
- return false;
- }
- return !MaskedInsts.empty();
-}
-
-// Look through the exit block to see whether there's a duplicate predicate
-// instruction. This can happen when we need to perform a select on values
-// from the last and previous iteration. Instead of doing a straight
-// replacement of that predicate with the vctp, clone the vctp and place it
-// in the block. This means that the VPR doesn't have to be live into the
-// exit block which should make it easier to convert this loop into a proper
-// tail predicated loop.
-static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) {
- BasicBlock *Exit = L->getUniqueExitBlock();
- if (!Exit) {
- LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
- return;
- }
-
- // Drop references and add operands to check for dead.
- SmallPtrSet<Instruction*, 4> Dead;
- while (!MaybeDead.empty()) {
- auto *I = MaybeDead.front();
- MaybeDead.remove(I);
- if (I->hasNUsesOrMore(1))
- continue;
+ LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n");
- for (auto &U : I->operands())
- if (auto *OpI = dyn_cast<Instruction>(U))
- MaybeDead.insert(OpI);
+ bool Changed = TryConvertActiveLaneMask(Setup->getArgOperand(0));
- Dead.insert(I);
- }
-
- for (auto *I : Dead) {
- LLVM_DEBUG(dbgs() << "ARM TP: removing dead insn: "; I->dump());
- I->eraseFromParent();
- }
-
- for (auto I : L->blocks())
- DeleteDeadPHIs(I);
+ return Changed;
}
// The active lane intrinsic has this form:
@@ -368,7 +199,7 @@ static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) {
// 3) The IV must be an induction phi with an increment equal to the
// vector width.
bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
- Value *TripCount, FixedVectorType *VecTy) {
+ Value *TripCount) {
bool ForceTailPredication =
EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
EnableTailPredication == TailPredication::ForceEnabled;
@@ -376,7 +207,10 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
Value *ElemCount = ActiveLaneMask->getOperand(1);
auto *EC= SE->getSCEV(ElemCount);
auto *TC = SE->getSCEV(TripCount);
- int VectorWidth = VecTy->getNumElements();
+ int VectorWidth =
+ cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements();
+ if (VectorWidth != 4 && VectorWidth != 8 && VectorWidth != 16)
+ return false;
ConstantInt *ConstElemCount = nullptr;
// 1) Smoke tests that the original scalar loop TripCount (TC) belongs to
@@ -503,21 +337,22 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
if (VectorWidth == StepValue)
return true;
- LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match "
- "vector width " << VectorWidth << "\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue
+ << " doesn't match vector width " << VectorWidth << "\n");
return false;
}
void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
- Value *TripCount, FixedVectorType *VecTy) {
+ Value *TripCount) {
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
Module *M = L->getHeader()->getModule();
Type *Ty = IntegerType::get(M->getContext(), 32);
- unsigned VectorWidth = VecTy->getNumElements();
+ unsigned VectorWidth =
+ cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements();
// Insert a phi to count the number of elements processed by the loop.
- Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI() );
+ Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI());
PHINode *Processed = Builder.CreatePHI(Ty, 2);
Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader());
@@ -553,50 +388,36 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
<< "ARM TP: Inserted VCTP: " << *VCTPCall << "\n");
}
-bool MVETailPredication::TryConvert(Value *TripCount) {
- if (!IsPredicatedVectorLoop()) {
- LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop.\n");
+bool MVETailPredication::TryConvertActiveLaneMask(Value *TripCount) {
+ SmallVector<IntrinsicInst *, 4> ActiveLaneMasks;
+ for (auto *BB : L->getBlocks())
+ for (auto &I : *BB)
+ if (auto *Int = dyn_cast<IntrinsicInst>(&I))
+ if (Int->getIntrinsicID() == Intrinsic::get_active_lane_mask)
+ ActiveLaneMasks.push_back(Int);
+
+ if (ActiveLaneMasks.empty())
return false;
- }
LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
- SetVector<Instruction*> Predicates;
-
- auto getPredicateOp = [](IntrinsicInst *I) {
- unsigned IntrinsicID = I->getIntrinsicID();
- if (IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated ||
- IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated)
- return 5;
- return (IntrinsicID == Intrinsic::masked_load || isGather(I)) ? 2 : 3;
- };
-
- // Walk through the masked intrinsics and try to find whether the predicate
- // operand is generated by intrinsic @llvm.get.active.lane.mask().
- for (auto *I : MaskedInsts) {
- Value *PredOp = I->getArgOperand(getPredicateOp(I));
- auto *Predicate = dyn_cast<Instruction>(PredOp);
- if (!Predicate || Predicates.count(Predicate))
- continue;
- auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
- if (!ActiveLaneMask ||
- ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
- continue;
-
- Predicates.insert(Predicate);
+ for (auto *ActiveLaneMask : ActiveLaneMasks) {
LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: "
<< *ActiveLaneMask << "\n");
- auto *VecTy = getVectorType(I);
- if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) {
+ if (!IsSafeActiveMask(ActiveLaneMask, TripCount)) {
LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
return false;
}
LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n");
- InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy);
+ InsertVCTPIntrinsic(ActiveLaneMask, TripCount);
}
- Cleanup(Predicates, L);
+ // Remove dead instructions and now dead phis.
+ for (auto *II : ActiveLaneMasks)
+ RecursivelyDeleteTriviallyDeadInstructions(II);
+ for (auto I : L->blocks())
+ DeleteDeadPHIs(I);
return true;
}
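A note on the new VectorWidth check in IsSafeActiveMask above: the three
accepted widths correspond to the MVE VCTP intrinsics, one predicate lane per
element of a 128-bit vector (a sketch, assuming the standard MVE declarations):

declare <4 x i1>  @llvm.arm.mve.vctp32(i32) ; VectorWidth == 4,  32-bit elements
declare <8 x i1>  @llvm.arm.mve.vctp16(i32) ; VectorWidth == 8,  16-bit elements
declare <16 x i1> @llvm.arm.mve.vctp8(i32)  ; VectorWidth == 16,  8-bit elements

A <2 x i1> active lane mask has no VCTP equivalent, so the pass now rejects it
and leaves the loop unpredicated; the new test_width2 test below pins down that
behaviour.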
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
index 5c97f1b953cf..bd927fdcf859 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll
@@ -241,42 +241,18 @@ define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA,
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB5_1: @ %vector.ph
-; CHECK-NEXT: adds r3, r2, #3
-; CHECK-NEXT: vdup.32 q1, r2
-; CHECK-NEXT: bic r3, r3, #3
-; CHECK-NEXT: sub.w r12, r3, #4
-; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: add.w lr, r3, r12, lsr #2
-; CHECK-NEXT: adr r3, .LCPI5_0
-; CHECK-NEXT: dls lr, lr
-; CHECK-NEXT: vldrw.u32 q0, [r3]
-; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB5_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q2, q0, r12
-; CHECK-NEXT: vdup.32 q3, r12
-; CHECK-NEXT: vcmp.u32 hi, q3, q2
-; CHECK-NEXT: add.w r12, r12, #4
-; CHECK-NEXT: vpnot
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vcmpt.u32 hi, q1, q2
-; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
-; CHECK-NEXT: vrintr.f32 s15, s11
-; CHECK-NEXT: vrintr.f32 s14, s10
-; CHECK-NEXT: vrintr.f32 s13, s9
-; CHECK-NEXT: vrintr.f32 s12, s8
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q3, [r1], #16
-; CHECK-NEXT: le lr, .LBB5_2
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
+; CHECK-NEXT: vrintr.f32 s7, s3
+; CHECK-NEXT: vrintr.f32 s6, s2
+; CHECK-NEXT: vrintr.f32 s5, s1
+; CHECK-NEXT: vrintr.f32 s4, s0
+; CHECK-NEXT: vstrw.32 q1, [r1], #16
+; CHECK-NEXT: letp lr, .LBB5_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: @ %bb.4:
-; CHECK-NEXT: .LCPI5_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
entry:
%cmp5 = icmp eq i32 %n, 0
br i1 %cmp5, label %for.cond.cleanup, label %vector.ph
diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
index 2a5d32013d47..729493163b81 100644
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -410,7 +410,156 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) {
ret <16 x i8> %select
}
+define void @test_width2(i32* nocapture readnone %x, i32* nocapture %y, i8 zeroext %m) {
+; CHECK-LABEL: test_width2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: cmp r2, #0
+; CHECK-NEXT: beq.w .LBB4_3
+; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT: adds r0, r2, #1
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: bic r0, r0, #1
+; CHECK-NEXT: vmov.32 q2[0], r2
+; CHECK-NEXT: subs r0, #2
+; CHECK-NEXT: vmov.i64 q0, #0xffffffff
+; CHECK-NEXT: vmov.32 q2[2], r2
+; CHECK-NEXT: movs r6, #0
+; CHECK-NEXT: add.w lr, r3, r0, lsr #1
+; CHECK-NEXT: adr r3, .LCPI4_0
+; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: vldrw.u32 q1, [r3]
+; CHECK-NEXT: vand q2, q2, q0
+; CHECK-NEXT: .LBB4_2: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vmov.32 q3[0], r6
+; CHECK-NEXT: vmov r5, s8
+; CHECK-NEXT: vmov.32 q3[2], r6
+; CHECK-NEXT: vmov r0, s9
+; CHECK-NEXT: vand q3, q3, q0
+; CHECK-NEXT: adds r6, #2
+; CHECK-NEXT: vmov r3, s14
+; CHECK-NEXT: vmov r2, s15
+; CHECK-NEXT: adds r3, #1
+; CHECK-NEXT: adc r12, r2, #0
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: vmov.32 q3[0], r2
+; CHECK-NEXT: vmov.32 q3[2], r3
+; CHECK-NEXT: vand q3, q3, q0
+; CHECK-NEXT: vmov r4, s12
+; CHECK-NEXT: teq.w r4, r2
+; CHECK-NEXT: cset r2, ne
+; CHECK-NEXT: tst.w r2, #1
+; CHECK-NEXT: csetm r2, ne
+; CHECK-NEXT: vmov.32 q4[0], r2
+; CHECK-NEXT: vmov.32 q4[1], r2
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: eors r3, r2
+; CHECK-NEXT: orrs.w r3, r3, r12
+; CHECK-NEXT: cset r3, ne
+; CHECK-NEXT: tst.w r3, #1
+; CHECK-NEXT: csetm r3, ne
+; CHECK-NEXT: subs r5, r4, r5
+; CHECK-NEXT: vmov.32 q4[2], r3
+; CHECK-NEXT: vmov r5, s10
+; CHECK-NEXT: vmov.32 q4[3], r3
+; CHECK-NEXT: vmov r3, s13
+; CHECK-NEXT: veor q4, q4, q1
+; CHECK-NEXT: sbcs.w r0, r3, r0
+; CHECK-NEXT: vmov r3, s11
+; CHECK-NEXT: mov.w r0, #0
+; CHECK-NEXT: it lo
+; CHECK-NEXT: movlo r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: csetm r0, ne
+; CHECK-NEXT: subs r2, r2, r5
+; CHECK-NEXT: vmov.32 q5[0], r0
+; CHECK-NEXT: vmov.32 q5[1], r0
+; CHECK-NEXT: vmov r0, s15
+; CHECK-NEXT: @ implicit-def: $q3
+; CHECK-NEXT: sbcs r0, r3
+; CHECK-NEXT: mov.w r0, #0
+; CHECK-NEXT: it lo
+; CHECK-NEXT: movlo r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: csetm r0, ne
+; CHECK-NEXT: vmov.32 q5[2], r0
+; CHECK-NEXT: vmov.32 q5[3], r0
+; CHECK-NEXT: vand q4, q4, q5
+; CHECK-NEXT: vmov r2, s16
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: and r2, r2, #1
+; CHECK-NEXT: orr.w r3, r2, r0, lsl #1
+; CHECK-NEXT: sub.w r2, r1, #8
+; CHECK-NEXT: lsls r0, r3, #31
+; CHECK-NEXT: itt ne
+; CHECK-NEXT: ldrne r0, [r2]
+; CHECK-NEXT: vmovne.32 q3[0], r0
+; CHECK-NEXT: and r0, r3, #3
+; CHECK-NEXT: lsls r0, r0, #30
+; CHECK-NEXT: itt mi
+; CHECK-NEXT: ldrmi r0, [r2, #4]
+; CHECK-NEXT: vmovmi.32 q3[2], r0
+; CHECK-NEXT: vmov r2, s16
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: and r2, r2, #1
+; CHECK-NEXT: orr.w r2, r2, r0, lsl #1
+; CHECK-NEXT: lsls r0, r2, #31
+; CHECK-NEXT: itt ne
+; CHECK-NEXT: vmovne r0, s12
+; CHECK-NEXT: strne r0, [r1]
+; CHECK-NEXT: and r0, r2, #3
+; CHECK-NEXT: lsls r0, r0, #30
+; CHECK-NEXT: itt mi
+; CHECK-NEXT: vmovmi r0, s14
+; CHECK-NEXT: strmi r0, [r1, #4]
+; CHECK-NEXT: adds r1, #8
+; CHECK-NEXT: le lr, .LBB4_2
+; CHECK-NEXT: .LBB4_3: @ %for.cond.cleanup
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: @ %bb.4:
+; CHECK-NEXT: .LCPI4_0:
+; CHECK-NEXT: .long 1 @ 0x1
+; CHECK-NEXT: .long 0 @ 0x0
+; CHECK-NEXT: .long 1 @ 0x1
+; CHECK-NEXT: .long 0 @ 0x0
+entry:
+ %cmp9.not = icmp eq i8 %m, 0
+ br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i8 %m to i32
+ %n.rnd.up = add nuw nsw i32 %wide.trip.count, 1
+ %n.vec = and i32 %n.rnd.up, 510
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %for.body.preheader
+ %index = phi i32 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+ %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 %index, i32 %wide.trip.count)
+ %0 = add nsw i32 %index, -2
+ %1 = getelementptr inbounds i32, i32* %y, i32 %0
+ %2 = bitcast i32* %1 to <2 x i32>*
+ %wide.masked.load = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %2, i32 4, <2 x i1> %active.lane.mask, <2 x i32> undef)
+ %3 = getelementptr inbounds i32, i32* %y, i32 %index
+ %4 = bitcast i32* %3 to <2 x i32>*
+ call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %wide.masked.load, <2 x i32>* %4, i32 4, <2 x i1> %active.lane.mask)
+ %index.next = add i32 %index, 2
+ %5 = icmp eq i32 %index.next, %n.vec
+ br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body, %entry
+ ret void
+}
+
+declare <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32, i32)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <7 x i1> @llvm.get.active.lane.mask.v7i1.i32(i32, i32)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
+declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
+declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)