[llvm] 2668775 - [LSR][ARM] Add new TTI hook to mark some LSR chains as profitable

via llvm-commits <llvm-commits at lists.llvm.org>
Wed May 13 06:20:50 PDT 2020


Author: Pierre-vh
Date: 2020-05-13T14:18:28+01:00
New Revision: 2668775f66656f719f7d8164066ec5ca64d707f2

URL: https://github.com/llvm/llvm-project/commit/2668775f66656f719f7d8164066ec5ca64d707f2
DIFF: https://github.com/llvm/llvm-project/commit/2668775f66656f719f7d8164066ec5ca64d707f2.diff

LOG: [LSR][ARM] Add new TTI hook to mark some LSR chains as profitable

This patch adds a new TTI hook that lets targets tell LSR that a chain
containing a given instruction is already profitable and should not be
optimized. It also implements the hook for ARM so that LSR does not
optimize chains that include the MVE VCTP intrinsics, since doing so
could block tail-predication.

Differential Revision: https://reviews.llvm.org/D79418
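
For illustration, here is a minimal sketch of what a target-side override
of the new hook could look like, modeled on the ARM implementation in the
diff below (MyTargetTTIImpl and Intrinsic::mytarget_loop_count are
hypothetical placeholders, not part of this patch):

    // Sketch only: a hypothetical target marks any chain that feeds one of
    // its loop-control intrinsics as already profitable, so LSR keeps the
    // chain intact.
    bool MyTargetTTIImpl::isProfitableLSRChainElement(Instruction *I) {
      if (auto *II = dyn_cast<IntrinsicInst>(I))
        // Intrinsic::mytarget_loop_count stands in for a real intrinsic ID.
        return II->getIntrinsicID() == Intrinsic::mytarget_loop_count;
      return false;
    }

LSR queries the hook from isProfitableChain (see the LoopStrengthReduce.cpp
hunk below): returning true for any user instruction in a chain marks the
whole chain as profitable and keeps it from being optimized.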

Added: 
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll
    llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains.ll

Modified: 
    llvm/include/llvm/Analysis/TargetTransformInfo.h
    llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/lib/Analysis/TargetTransformInfo.cpp
    llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
    llvm/lib/Target/ARM/ARMTargetTransformInfo.h
    llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f3e57567b6bd..e5f2c53c910f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -519,6 +519,9 @@ class TargetTransformInfo {
   bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                      TargetTransformInfo::LSRCost &C2) const;
 
+  /// \returns true if LSR should not optimize a chain that includes \p I.
+  bool isProfitableLSRChainElement(Instruction *I) const;
+
   /// Return true if the target can fuse a compare and branch.
   /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
   /// calculation for the instructions in a loop.
@@ -1233,6 +1236,7 @@ class TargetTransformInfo::Concept {
                                      Instruction *I) = 0;
   virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                              TargetTransformInfo::LSRCost &C2) = 0;
+  virtual bool isProfitableLSRChainElement(Instruction *I) = 0;
   virtual bool canMacroFuseCmp() = 0;
   virtual bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
                           LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
@@ -1542,6 +1546,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                      TargetTransformInfo::LSRCost &C2) override {
     return Impl.isLSRCostLess(C1, C2);
   }
+  bool isProfitableLSRChainElement(Instruction *I) override {
+    return Impl.isProfitableLSRChainElement(I);
+  }
   bool canMacroFuseCmp() override { return Impl.canMacroFuseCmp(); }
   bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
                   DominatorTree *DT, AssumptionCache *AC,

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 372a254b64c5..669e4b4f3ff6 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -166,6 +166,8 @@ class TargetTransformInfoImplBase {
                     C2.ScaleCost, C2.ImmCost, C2.SetupCost);
   }
 
+  bool isProfitableLSRChainElement(Instruction *I) { return false; }
+
   bool canMacroFuseCmp() { return false; }
 
   bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 5a066448b156..0d972d72ae72 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -262,6 +262,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
   }
 
+  bool isProfitableLSRChainElement(Instruction *I) {
+    return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
+  }
+
   int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                            bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
     TargetLoweringBase::AddrMode AM;

diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 95b17aa702d0..cda3d16b9526 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -261,6 +261,10 @@ bool TargetTransformInfo::isLSRCostLess(LSRCost &C1, LSRCost &C2) const {
   return TTIImpl->isLSRCostLess(C1, C2);
 }
 
+bool TargetTransformInfo::isProfitableLSRChainElement(Instruction *I) const {
+  return TTIImpl->isProfitableLSRChainElement(I);
+}
+
 bool TargetTransformInfo::canMacroFuseCmp() const {
   return TTIImpl->canMacroFuseCmp();
 }

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index c90429f87d1b..3864e2894172 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsARM.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/SubtargetFeature.h"
@@ -550,6 +551,23 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
 }
 
+bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    // If a VCTP is part of a chain, it's already profitable and shouldn't be
+    // optimized, else LSR may block tail-predication.
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::arm_mve_vctp8:
+    case Intrinsic::arm_mve_vctp16:
+    case Intrinsic::arm_mve_vctp32:
+    case Intrinsic::arm_mve_vctp64:
+      return true;
+    default:
+      break;
+    }
+  }
+  return false;
+}
+
 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
     return false;

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 9cb4916b7002..d6efc6e7ae9e 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -151,6 +151,8 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
     return ST->getMaxInterleaveFactor();
   }
 
+  bool isProfitableLSRChainElement(Instruction *I);
+
   bool isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment);
 
   bool isLegalMaskedStore(Type *DataTy, MaybeAlign Alignment) {

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 652ff6bfb6d8..d8a272a7270e 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -2820,9 +2820,10 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
 /// increments can be computed in fewer registers when chained.
 ///
 /// TODO: Consider IVInc free if it's already used in another chains.
-static bool
-isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
-                  ScalarEvolution &SE) {
+static bool isProfitableChain(IVChain &Chain,
+                              SmallPtrSetImpl<Instruction *> &Users,
+                              ScalarEvolution &SE,
+                              const TargetTransformInfo &TTI) {
   if (StressIVChain)
     return true;
 
@@ -2851,7 +2852,14 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
   unsigned NumConstIncrements = 0;
   unsigned NumVarIncrements = 0;
   unsigned NumReusedIncrements = 0;
+
+  if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
+    return true;
+
   for (const IVInc &Inc : Chain) {
+    if (TTI.isProfitableLSRChainElement(Inc.UserInst))
+      return true;
+
     if (Inc.IncExpr->isZero())
       continue;
 
@@ -3082,7 +3090,7 @@ void LSRInstance::CollectChains() {
   for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
        UsersIdx < NChains; ++UsersIdx) {
     if (!isProfitableChain(IVChainVec[UsersIdx],
-                           ChainUsersVec[UsersIdx].FarUsers, SE))
+                           ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
       continue;
     // Preserve the chain at UsesIdx.
     if (ChainIdx != UsersIdx)

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll
new file mode 100644
index 000000000000..bc2c7e084ea7
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -disable-mve-tail-predication=false -mtriple=thumbv8.1m.main -mattr=+mve,+mve.fp %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m-arm-none-eabi"
+
+; Tests that LSR will not interfere with the VCTP intrinsic,
+; and that this loop will correctly become tail-predicated.
+
+define arm_aapcs_vfpcc float @vctpi32(float* %0, i32 %1) {
+; CHECK-LABEL: vctpi32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vmvn.i32 q1, #0x1f
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    vadd.i32 q1, q3, q1
+; CHECK-NEXT:    subs r3, r1, #1
+; CHECK-NEXT:    vidup.u32 q2, r2, #8
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vadd.i32 q1, q2, r0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    dlstp.32 lr, r3
+; CHECK-NEXT:  .LBB0_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q2, [q1, #32]!
+; CHECK-NEXT:    vadd.f32 q0, q0, q2
+; CHECK-NEXT:    letp lr, .LBB0_1
+; CHECK-NEXT:  @ %bb.2:
+; CHECK-NEXT:    bl vecAddAcrossF32Mve
+; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vcvt.f32.s32 s0, s0
+; CHECK-NEXT:    vabs.f32 s0, s0
+; CHECK-NEXT:    pop {r7, pc}
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
+  %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:
+  %23 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32)
+declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)
+declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
+declare arm_aapcs_vfpcc i32 @vecAddAcrossF32Mve(...)
+declare float @llvm.fabs.f32(float)

diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains.ll
new file mode 100644
index 000000000000..08b24383a173
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains.ll
@@ -0,0 +1,257 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=thumbv8.1m.main -mattr=+mve %s -S -loop-reduce -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m-arm-none-eabi"
+
+define float @vctp8(float* %0, i32 %1) {
+; CHECK-LABEL: @vctp8(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    br label [[TMP11:%.*]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 [[TMP12]])
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @v16i1_to_v4i1(<16 x i1> [[TMP15]])
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
+; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[MASK]], <4 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
+; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
+; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
+; CHECK:       22:
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]])
+; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
+; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
+; CHECK-NEXT:    ret float [[TMP25]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
+  %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 %12)
+  %mask = tail call <4 x i1> @v16i1_to_v4i1(<16 x i1> %15)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %mask)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %mask, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+define float @vctp16(float* %0, i32 %1) {
+; CHECK-LABEL: @vctp16(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    br label [[TMP11:%.*]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP12]])
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @v8i1_to_v4i1(<8 x i1> [[TMP15]])
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
+; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[MASK]], <4 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
+; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
+; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
+; CHECK:       22:
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]])
+; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
+; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
+; CHECK-NEXT:    ret float [[TMP25]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
+  %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %12)
+  %mask = tail call <4 x i1> @v8i1_to_v4i1(<8 x i1> %15)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %mask)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %mask, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+define float @vctpi32(float* %0, i32 %1) {
+; CHECK-LABEL: @vctpi32(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    br label [[TMP11:%.*]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP12]])
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[TMP15]])
+; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
+; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[TMP15]], <4 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
+; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
+; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
+; CHECK:       22:
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]])
+; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
+; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
+; CHECK-NEXT:    ret float [[TMP25]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
+  %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+
+define float @vctpi64(float* %0, i32 %1) {
+; CHECK-LABEL: @vctpi64(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    br label [[TMP11:%.*]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 [[TMP12]])
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[TMP15]])
+; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
+; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[TMP15]], <4 x float> [[TMP13]])
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
+; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
+; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
+; CHECK:       22:
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]])
+; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
+; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
+; CHECK-NEXT:    ret float [[TMP25]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
+  %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 %12)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32)
+declare <16 x i1> @llvm.arm.mve.vctp8(i32)
+declare <8 x i1> @llvm.arm.mve.vctp16(i32)
+declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+declare <4 x i1> @llvm.arm.mve.vctp64(i32)
+declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)
+declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
+declare i32 @vecAddAcrossF32Mve(...)
+declare <4 x i1> @v8i1_to_v4i1(<8 x i1>)
+declare <4 x i1> @v16i1_to_v4i1(<16 x i1>)
+declare float @llvm.fabs.f32(float)
