[llvm] 191f9fa - [AArch64][SVE] Move instcombine-like transforms out of SVEIntrinsicOpts
Bradley Smith via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 20 07:18:00 PDT 2021
Author: Bradley Smith
Date: 2021-07-20T14:17:30Z
New Revision: 191f9fa5d2af2b01aa23a591e780a1c36a014a89
URL: https://github.com/llvm/llvm-project/commit/191f9fa5d2af2b01aa23a591e780a1c36a014a89
DIFF: https://github.com/llvm/llvm-project/commit/191f9fa5d2af2b01aa23a591e780a1c36a014a89.diff
LOG: [AArch64][SVE] Move instcombine-like transforms out of SVEIntrinsicOpts
Instead, move them to the instcombine that runs via AArch64TargetTransformInfo.
Differential Revision: https://reviews.llvm.org/D106144
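As an illustration of the kind of fold being moved, the idempotent [f]mul rewrite
([f]mul pg (dupx 1) %n => %n) now fires during instcombine. A minimal sketch in
LLVM IR, adapted from the renamed sve-intrinsic-fmul-idempotency.ll test below
(value names here are illustrative, not taken from the patch):

    ; multiply %a by a splat of 1.0 built with aarch64_sve_dup_x
    %one = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 1.000000e+00)
    %res = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %one)
    ; instCombineSVEVectorMul replaces all uses of %res with %a, so the call folds away
    ret <vscale x 4 x float> %res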
Added:
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-ptest.ll
Modified:
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
Removed:
llvm/test/CodeGen/AArch64/sve-fmul-idempotency.ll
llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll
llvm/test/CodeGen/AArch64/sve-mul-idempotency.ll
llvm/test/CodeGen/AArch64/sve-tbl-dupx.ll
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5f1b28a8f9a3b..4ab754465f39c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -686,6 +686,115 @@ instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
: None;
}
+static Optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
+ IntrinsicInst &II) {
+ IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
+ IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
+
+ if (Op1 && Op2 &&
+ Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+ Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+ Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
+
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+
+ Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
+ Type *Tys[] = {Op1->getArgOperand(0)->getType()};
+
+ auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
+
+ PTest->takeName(&II);
+ return IC.replaceInstUsesWith(II, PTest);
+ }
+
+ return None;
+}
+
+static Optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
+ IntrinsicInst &II) {
+ auto *OpPredicate = II.getOperand(0);
+ auto *OpMultiplicand = II.getOperand(1);
+ auto *OpMultiplier = II.getOperand(2);
+
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+
+ // Return true if a given instruction is an aarch64_sve_dup_x intrinsic call
+ // with a unit splat value, false otherwise.
+ auto IsUnitDupX = [](auto *I) {
+ auto *IntrI = dyn_cast<IntrinsicInst>(I);
+ if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
+ return false;
+
+ auto *SplatValue = IntrI->getOperand(0);
+ return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
+ };
+
+ // Return true if a given instruction is an aarch64_sve_dup intrinsic call
+ // with a unit splat value, false otherwise.
+ auto IsUnitDup = [](auto *I) {
+ auto *IntrI = dyn_cast<IntrinsicInst>(I);
+ if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
+ return false;
+
+ auto *SplatValue = IntrI->getOperand(2);
+ return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
+ };
+
+ // The OpMultiplier variable should always point to the dup (if any), so
+ // swap if necessary.
+ if (IsUnitDup(OpMultiplicand) || IsUnitDupX(OpMultiplicand))
+ std::swap(OpMultiplier, OpMultiplicand);
+
+ if (IsUnitDupX(OpMultiplier)) {
+ // [f]mul pg (dupx 1) %n => %n
+ OpMultiplicand->takeName(&II);
+ return IC.replaceInstUsesWith(II, OpMultiplicand);
+ } else if (IsUnitDup(OpMultiplier)) {
+ // [f]mul pg (dup pg 1) %n => %n
+ auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
+ auto *DupPg = DupInst->getOperand(1);
+ // TODO: this is naive. The optimization is still valid if DupPg
+ // 'encompasses' OpPredicate, not only if they're the same predicate.
+ if (OpPredicate == DupPg) {
+ OpMultiplicand->takeName(&II);
+ return IC.replaceInstUsesWith(II, OpMultiplicand);
+ }
+ }
+
+ return None;
+}
+
+static Optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
+ IntrinsicInst &II) {
+ auto *OpVal = II.getOperand(0);
+ auto *OpIndices = II.getOperand(1);
+ VectorType *VTy = cast<VectorType>(II.getType());
+
+ // Check whether OpIndices is an aarch64_sve_dup_x intrinsic call with
+ // constant splat value < minimal element count of result.
+ auto *DupXIntrI = dyn_cast<IntrinsicInst>(OpIndices);
+ if (!DupXIntrI || DupXIntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
+ return None;
+
+ auto *SplatValue = dyn_cast<ConstantInt>(DupXIntrI->getOperand(0));
+ if (!SplatValue ||
+ SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
+ return None;
+
+ // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
+ // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+ auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
+ auto *VectorSplat =
+ Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
+
+ VectorSplat->takeName(&II);
+ return IC.replaceInstUsesWith(II, VectorSplat);
+}
+
Optional<Instruction *>
AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const {
@@ -713,6 +822,15 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return instCombineSVECntElts(IC, II, 8);
case Intrinsic::aarch64_sve_cntb:
return instCombineSVECntElts(IC, II, 16);
+ case Intrinsic::aarch64_sve_ptest_any:
+ case Intrinsic::aarch64_sve_ptest_first:
+ case Intrinsic::aarch64_sve_ptest_last:
+ return instCombineSVEPTest(IC, II);
+ case Intrinsic::aarch64_sve_mul:
+ case Intrinsic::aarch64_sve_fmul:
+ return instCombineSVEVectorMul(IC, II);
+ case Intrinsic::aarch64_sve_tbl:
+ return instCombineSVETBL(IC, II);
}
return None;
diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
index 16bea0655511e..79dcca8f84587 100644
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -60,18 +60,9 @@ struct SVEIntrinsicOpts : public ModulePass {
SmallSetVector<IntrinsicInst *, 4> &PTrues);
bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
- /// Operates at the instruction-scope. I.e., optimizations are applied local
- /// to individual instructions.
- static bool optimizeIntrinsic(Instruction *I);
- bool optimizeIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
-
/// Operates at the function-scope. I.e., optimizations are applied local to
/// the functions themselves.
bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
-
- static bool optimizePTest(IntrinsicInst *I);
- static bool optimizeVectorMul(IntrinsicInst *I);
- static bool optimizeTBL(IntrinsicInst *I);
};
} // end anonymous namespace
@@ -285,185 +276,11 @@ bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
return Changed;
}
-bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
- IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
- IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));
-
- if (Op1 && Op2 &&
- Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
- Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
- Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
-
- Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
- Type *Tys[] = {Op1->getArgOperand(0)->getType()};
- Module *M = I->getParent()->getParent()->getParent();
-
- auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys);
- auto CI = CallInst::Create(Fn, Ops, I->getName(), I);
-
- I->replaceAllUsesWith(CI);
- I->eraseFromParent();
- if (Op1->use_empty())
- Op1->eraseFromParent();
- if (Op1 != Op2 && Op2->use_empty())
- Op2->eraseFromParent();
-
- return true;
- }
-
- return false;
-}
-
-bool SVEIntrinsicOpts::optimizeVectorMul(IntrinsicInst *I) {
- assert((I->getIntrinsicID() == Intrinsic::aarch64_sve_mul ||
- I->getIntrinsicID() == Intrinsic::aarch64_sve_fmul) &&
- "Unexpected opcode");
-
- auto *OpPredicate = I->getOperand(0);
- auto *OpMultiplicand = I->getOperand(1);
- auto *OpMultiplier = I->getOperand(2);
-
- // Return true if a given instruction is an aarch64_sve_dup_x intrinsic call
- // with a unit splat value, false otherwise.
- auto IsUnitDupX = [](auto *I) {
- auto *IntrI = dyn_cast<IntrinsicInst>(I);
- if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
- return false;
-
- auto *SplatValue = IntrI->getOperand(0);
- return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
- };
-
- // Return true if a given instruction is an aarch64_sve_dup intrinsic call
- // with a unit splat value, false otherwise.
- auto IsUnitDup = [](auto *I) {
- auto *IntrI = dyn_cast<IntrinsicInst>(I);
- if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
- return false;
-
- auto *SplatValue = IntrI->getOperand(2);
- return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
- };
-
- bool Changed = true;
-
- // The OpMultiplier variable should always point to the dup (if any), so
- // swap if necessary.
- if (IsUnitDup(OpMultiplicand) || IsUnitDupX(OpMultiplicand))
- std::swap(OpMultiplier, OpMultiplicand);
-
- if (IsUnitDupX(OpMultiplier)) {
- // [f]mul pg (dupx 1) %n => %n
- I->replaceAllUsesWith(OpMultiplicand);
- I->eraseFromParent();
- Changed = true;
- } else if (IsUnitDup(OpMultiplier)) {
- // [f]mul pg (dup pg 1) %n => %n
- auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
- auto *DupPg = DupInst->getOperand(1);
- // TODO: this is naive. The optimization is still valid if DupPg
- // 'encompasses' OpPredicate, not only if they're the same predicate.
- if (OpPredicate == DupPg) {
- I->replaceAllUsesWith(OpMultiplicand);
- I->eraseFromParent();
- Changed = true;
- }
- }
-
- // If an instruction was optimized out then it is possible that some dangling
- // instructions are left.
- if (Changed) {
- auto *OpPredicateInst = dyn_cast<Instruction>(OpPredicate);
- auto *OpMultiplierInst = dyn_cast<Instruction>(OpMultiplier);
- if (OpMultiplierInst && OpMultiplierInst->use_empty())
- OpMultiplierInst->eraseFromParent();
- if (OpPredicateInst && OpPredicateInst->use_empty())
- OpPredicateInst->eraseFromParent();
- }
-
- return Changed;
-}
-
-bool SVEIntrinsicOpts::optimizeTBL(IntrinsicInst *I) {
- assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_tbl &&
- "Unexpected opcode");
-
- auto *OpVal = I->getOperand(0);
- auto *OpIndices = I->getOperand(1);
- VectorType *VTy = cast<VectorType>(I->getType());
-
- // Check whether OpIndices is an aarch64_sve_dup_x intrinsic call with
- // constant splat value < minimal element count of result.
- auto *DupXIntrI = dyn_cast<IntrinsicInst>(OpIndices);
- if (!DupXIntrI || DupXIntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
- return false;
-
- auto *SplatValue = dyn_cast<ConstantInt>(DupXIntrI->getOperand(0));
- if (!SplatValue ||
- SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
- return false;
-
- // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
- // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
- LLVMContext &Ctx = I->getContext();
- IRBuilder<> Builder(Ctx);
- Builder.SetInsertPoint(I);
- auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
- auto *VectorSplat =
- Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
-
- I->replaceAllUsesWith(VectorSplat);
- I->eraseFromParent();
- if (DupXIntrI->use_empty())
- DupXIntrI->eraseFromParent();
- return true;
-}
-
-bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
- IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
- if (!IntrI)
- return false;
-
- switch (IntrI->getIntrinsicID()) {
- case Intrinsic::aarch64_sve_fmul:
- case Intrinsic::aarch64_sve_mul:
- return optimizeVectorMul(IntrI);
- case Intrinsic::aarch64_sve_ptest_any:
- case Intrinsic::aarch64_sve_ptest_first:
- case Intrinsic::aarch64_sve_ptest_last:
- return optimizePTest(IntrI);
- case Intrinsic::aarch64_sve_tbl:
- return optimizeTBL(IntrI);
- default:
- return false;
- }
-
- return true;
-}
-
-bool SVEIntrinsicOpts::optimizeIntrinsicCalls(
- SmallSetVector<Function *, 4> &Functions) {
- bool Changed = false;
- for (auto *F : Functions) {
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
-
- // Traverse the DT with an rpo walk so we see defs before uses, allowing
- // simplification to be done incrementally.
- BasicBlock *Root = DT->getRoot();
- ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
- for (auto *BB : RPOT)
- for (Instruction &I : make_early_inc_range(*BB))
- Changed |= optimizeIntrinsic(&I);
- }
- return Changed;
-}
-
bool SVEIntrinsicOpts::optimizeFunctions(
SmallSetVector<Function *, 4> &Functions) {
bool Changed = false;
Changed |= optimizePTrueIntrinsicCalls(Functions);
- Changed |= optimizeIntrinsicCalls(Functions);
return Changed;
}
@@ -480,13 +297,7 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) {
continue;
switch (F.getIntrinsicID()) {
- case Intrinsic::aarch64_sve_ptest_any:
- case Intrinsic::aarch64_sve_ptest_first:
- case Intrinsic::aarch64_sve_ptest_last:
case Intrinsic::aarch64_sve_ptrue:
- case Intrinsic::aarch64_sve_mul:
- case Intrinsic::aarch64_sve_fmul:
- case Intrinsic::aarch64_sve_tbl:
for (User *U : F.users())
Functions.insert(cast<Instruction>(U)->getFunction());
break;
diff --git a/llvm/test/CodeGen/AArch64/sve-fmul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
similarity index 92%
rename from llvm/test/CodeGen/AArch64/sve-fmul-idempotency.ll
rename to llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
index bf1d3c60a070d..d13f76c2be301 100644
--- a/llvm/test/CodeGen/AArch64/sve-fmul-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
@@ -1,8 +1,9 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck %s
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
; Idempotent fmuls -- should compile to just a ret.
-define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
+define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
; CHECK-LABEL: @idempotent_fmul_f16(
; CHECK-NEXT: ret <vscale x 8 x half> [[A:%.*]]
;
@@ -11,7 +12,7 @@ define <vscale x 8 x half> @idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x
ret <vscale x 8 x half> %2
}
-define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
+define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
; CHECK-LABEL: @idempotent_fmul_f32(
; CHECK-NEXT: ret <vscale x 4 x float> [[A:%.*]]
;
@@ -20,7 +21,7 @@ define <vscale x 4 x float> @idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale
ret <vscale x 4 x float> %2
}
-define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
; CHECK-LABEL: @idempotent_fmul_f64(
; CHECK-NEXT: ret <vscale x 2 x double> [[A:%.*]]
;
@@ -29,7 +30,7 @@ define <vscale x 2 x double> @idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale
ret <vscale x 2 x double> %2
}
-define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
; CHECK-LABEL: @idempotent_fmul_different_argument_order(
; CHECK-NEXT: ret <vscale x 2 x double> [[A:%.*]]
;
@@ -39,7 +40,7 @@ define <vscale x 2 x double> @idempotent_fmul_different_argument_order(<vscale x
ret <vscale x 2 x double> %2
}
-define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
+define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
; CHECK-LABEL: @idempotent_fmul_with_predicated_dup(
; CHECK-NEXT: ret <vscale x 8 x half> [[A:%.*]]
;
@@ -48,7 +49,7 @@ define <vscale x 8 x half> @idempotent_fmul_with_predicated_dup(<vscale x 8 x i1
ret <vscale x 8 x half> %2
}
-define <vscale x 8 x half> @idempotent_fmul_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
+define <vscale x 8 x half> @idempotent_fmul_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
; Edge case -- make sure that the case where we're fmultiplying two dups
; together is sane.
; CHECK-LABEL: @idempotent_fmul_two_dups(
@@ -62,7 +63,7 @@ define <vscale x 8 x half> @idempotent_fmul_two_dups(<vscale x 8 x i1> %pg, <vsc
}
; Non-idempotent fmuls -- we don't expect these to be optimised out.
-define <vscale x 8 x half> @non_idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
+define <vscale x 8 x half> @non_idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
; CHECK-LABEL: @non_idempotent_fmul_f16(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 0xH4000)
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[TMP1]])
@@ -73,7 +74,7 @@ define <vscale x 8 x half> @non_idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vsca
ret <vscale x 8 x half> %2
}
-define <vscale x 4 x float> @non_idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) {
+define <vscale x 4 x float> @non_idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
; CHECK-LABEL: @non_idempotent_fmul_f32(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 2.000000e+00)
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[TMP1]])
@@ -84,7 +85,7 @@ define <vscale x 4 x float> @non_idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vsc
ret <vscale x 4 x float> %2
}
-define <vscale x 2 x double> @non_idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) {
+define <vscale x 2 x double> @non_idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
; CHECK-LABEL: @non_idempotent_fmul_f64(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 2.000000e+00)
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[TMP1]])
@@ -95,7 +96,7 @@ define <vscale x 2 x double> @non_idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vs
ret <vscale x 2 x double> %2
}
-define <vscale x 2 x double> @non_idempotent_fmul_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x double> %a) {
+define <vscale x 2 x double> @non_idempotent_fmul_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x double> %a) #0 {
; Different predicates
; CHECK-LABEL: @non_idempotent_fmul_with_predicated_dup(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> [[PG1:%.*]], double 1.000000e+00)
@@ -117,3 +118,5 @@ declare <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half>, <
declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
declare <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)
declare <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>, <vscale x 2 x double>)
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-mul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
similarity index 92%
rename from llvm/test/CodeGen/AArch64/sve-mul-idempotency.ll
rename to llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
index 2f6d9f81ce5c5..8e10050714a6f 100644
--- a/llvm/test/CodeGen/AArch64/sve-mul-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
@@ -1,8 +1,9 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck %s
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
; Idempotent muls -- should compile to just a ret.
-define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: @idempotent_mul_i16(
; CHECK-NEXT: ret <vscale x 8 x i16> [[A:%.*]]
;
@@ -11,7 +12,7 @@ define <vscale x 8 x i16> @idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8
ret <vscale x 8 x i16> %2
}
-define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: @idempotent_mul_i32(
; CHECK-NEXT: ret <vscale x 4 x i32> [[A:%.*]]
;
@@ -20,7 +21,7 @@ define <vscale x 4 x i32> @idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4
ret <vscale x 4 x i32> %2
}
-define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: @idempotent_mul_i64(
; CHECK-NEXT: ret <vscale x 2 x i64> [[A:%.*]]
;
@@ -29,7 +30,7 @@ define <vscale x 2 x i64> @idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2
ret <vscale x 2 x i64> %2
}
-define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: @idempotent_mul_different_argument_order(
; CHECK-NEXT: ret <vscale x 2 x i64> [[A:%.*]]
;
@@ -39,7 +40,7 @@ define <vscale x 2 x i64> @idempotent_mul_different_argument_order(<vscale x 2 x
ret <vscale x 2 x i64> %2
}
-define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: @idempotent_mul_with_predicated_dup(
; CHECK-NEXT: ret <vscale x 8 x i16> [[A:%.*]]
;
@@ -48,7 +49,7 @@ define <vscale x 8 x i16> @idempotent_mul_with_predicated_dup(<vscale x 8 x i1>
ret <vscale x 8 x i16> %2
}
-define <vscale x 8 x i16> @idempotent_mul_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+define <vscale x 8 x i16> @idempotent_mul_two_dups(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
; Edge case -- make sure that the case where we're multiplying two dups
; together is sane.
; CHECK-LABEL: @idempotent_mul_two_dups(
@@ -62,7 +63,7 @@ define <vscale x 8 x i16> @idempotent_mul_two_dups(<vscale x 8 x i1> %pg, <vscal
}
; Non-idempotent muls -- we don't expect these to be optimised out.
-define <vscale x 8 x i16> @non_idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+define <vscale x 8 x i16> @non_idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: @non_idempotent_mul_i16(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 2)
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[TMP1]])
@@ -73,7 +74,7 @@ define <vscale x 8 x i16> @non_idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale
ret <vscale x 8 x i16> %2
}
-define <vscale x 4 x i32> @non_idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+define <vscale x 4 x i32> @non_idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: @non_idempotent_mul_i32(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 2)
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[TMP1]])
@@ -84,7 +85,7 @@ define <vscale x 4 x i32> @non_idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale
ret <vscale x 4 x i32> %2
}
-define <vscale x 2 x i64> @non_idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) {
+define <vscale x 2 x i64> @non_idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: @non_idempotent_mul_i64(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 2)
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[TMP1]])
@@ -95,7 +96,7 @@ define <vscale x 2 x i64> @non_idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale
ret <vscale x 2 x i64> %2
}
-define <vscale x 2 x i64> @non_idempotent_mul_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x i64> %a) {
+define <vscale x 2 x i64> @non_idempotent_mul_with_predicated_dup(<vscale x 2 x i1> %pg1, <vscale x 2 x i1> %pg2, <vscale x 2 x i64> %a) #0 {
; Different predicates
; CHECK-LABEL: @non_idempotent_mul_with_predicated_dup(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> [[PG1:%.*]], i64 1)
@@ -117,3 +118,5 @@ declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vs
declare <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-tbl-dupx.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll
similarity index 96%
rename from llvm/test/CodeGen/AArch64/sve-tbl-dupx.ll
rename to llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll
index 6c4bba79ce84b..e46879fe645e8 100644
--- a/llvm/test/CodeGen/AArch64/sve-tbl-dupx.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll
@@ -1,9 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
; op2 = tbl(op1 dup_x(idx)) -> op2 = vector_splat(extractelement(op1, idx))
-define <vscale x 16 x i8> @dup_ext_i8(<vscale x 16 x i8> %data) {
+define <vscale x 16 x i8> @dup_ext_i8(<vscale x 16 x i8> %data) #0 {
; CHECK-LABEL: @dup_ext_i8(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 16 x i8> [[DATA:%.*]], i8 1
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP1]], i32 0
@@ -15,7 +16,7 @@ define <vscale x 16 x i8> @dup_ext_i8(<vscale x 16 x i8> %data) {
ret <vscale x 16 x i8> %out
}
-define <vscale x 8 x i16> @dup_ext_i16(<vscale x 8 x i16> %data) {
+define <vscale x 8 x i16> @dup_ext_i16(<vscale x 8 x i16> %data) #0 {
; CHECK-LABEL: @dup_ext_i16(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 8 x i16> [[DATA:%.*]], i16 1
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP1]], i32 0
@@ -27,7 +28,7 @@ define <vscale x 8 x i16> @dup_ext_i16(<vscale x 8 x i16> %data) {
ret <vscale x 8 x i16> %out
}
-define <vscale x 4 x i32> @dup_ext_i32(<vscale x 4 x i32> %data) {
+define <vscale x 4 x i32> @dup_ext_i32(<vscale x 4 x i32> %data) #0 {
; CHECK-LABEL: @dup_ext_i32(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[DATA:%.*]], i32 1
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP1]], i32 0
@@ -39,7 +40,7 @@ define <vscale x 4 x i32> @dup_ext_i32(<vscale x 4 x i32> %data) {
ret <vscale x 4 x i32> %out
}
-define <vscale x 2 x i64> @dup_ext_i64(<vscale x 2 x i64> %data) {
+define <vscale x 2 x i64> @dup_ext_i64(<vscale x 2 x i64> %data) #0 {
; CHECK-LABEL: @dup_ext_i64(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 2 x i64> [[DATA:%.*]], i64 1
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP1]], i32 0
@@ -51,7 +52,7 @@ define <vscale x 2 x i64> @dup_ext_i64(<vscale x 2 x i64> %data) {
ret <vscale x 2 x i64> %out
}
-define <vscale x 8 x half> @dup_ext_f16(<vscale x 8 x half> %data) {
+define <vscale x 8 x half> @dup_ext_f16(<vscale x 8 x half> %data) #0 {
; CHECK-LABEL: @dup_ext_f16(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 8 x half> [[DATA:%.*]], i16 1
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x half> poison, half [[TMP1]], i32 0
@@ -63,7 +64,7 @@ define <vscale x 8 x half> @dup_ext_f16(<vscale x 8 x half> %data) {
ret <vscale x 8 x half> %out
}
-define <vscale x 4 x float> @dup_ext_f32(<vscale x 4 x float> %data) {
+define <vscale x 4 x float> @dup_ext_f32(<vscale x 4 x float> %data) #0 {
; CHECK-LABEL: @dup_ext_f32(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x float> [[DATA:%.*]], i32 1
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[TMP1]], i32 0
@@ -75,7 +76,7 @@ define <vscale x 4 x float> @dup_ext_f32(<vscale x 4 x float> %data) {
ret <vscale x 4 x float> %out
}
-define <vscale x 2 x double> @dup_ext_f64(<vscale x 2 x double> %data) {
+define <vscale x 2 x double> @dup_ext_f64(<vscale x 2 x double> %data) #0 {
; CHECK-LABEL: @dup_ext_f64(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 2 x double> [[DATA:%.*]], i64 1
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP1]], i32 0
@@ -98,3 +99,5 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.tbl.nxv2i64( <vscale x 2 x i64>, <v
declare <vscale x 8 x half> @llvm.aarch64.sve.tbl.nxv8f16( <vscale x 8 x half>, <vscale x 8 x i16>)
declare <vscale x 4 x float> @llvm.aarch64.sve.tbl.nxv4f32( <vscale x 4 x float>, <vscale x 4 x i32>)
declare <vscale x 2 x double> @llvm.aarch64.sve.tbl.nxv2f64( <vscale x 2 x double>, <vscale x 2 x i64>)
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-ptest.ll
similarity index 58%
rename from llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll
rename to llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-ptest.ll
index 69766eda3e0d7..7ae35d37a39e4 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsics-ptest.ll
@@ -1,11 +1,13 @@
-; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s
+; RUN: opt -S -instcombine < %s | FileCheck %s
-define i1 @ptest_any1(<vscale x 2 x i1> %a) {
-; OPT-LABEL: ptest_any1
-; OPT: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
-; OPT-NOT: convert
-; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.any.nxv2i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %a)
-; OPT-NEXT: ret i1 %[[OUT]]
+target triple = "aarch64-unknown-linux-gnu"
+
+define i1 @ptest_any1(<vscale x 2 x i1> %a) #0 {
+; CHECK-LABEL: ptest_any1
+; CHECK: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
+; CHECK-NOT: convert
+; CHECK-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.any.nxv2i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %a)
+; CHECK-NEXT: ret i1 %[[OUT]]
%mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
%1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
%2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
@@ -14,12 +16,12 @@ define i1 @ptest_any1(<vscale x 2 x i1> %a) {
}
; No transform because the ptest is using differently sized operands.
-define i1 @ptest_any2(<vscale x 4 x i1> %a) {
-; OPT-LABEL: ptest_any2
-; OPT: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; OPT-NEXT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
-; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
-; OPT-NEXT: %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
+define i1 @ptest_any2(<vscale x 4 x i1> %a) #0 {
+; CHECK-LABEL: ptest_any2
+; CHECK: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
+; CHECK-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+; CHECK-NEXT: %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
%mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
%1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
%2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
@@ -27,12 +29,12 @@ define i1 @ptest_any2(<vscale x 4 x i1> %a) {
ret i1 %out
}
-define i1 @ptest_first(<vscale x 4 x i1> %a) {
-; OPT-LABEL: ptest_first
-; OPT: %mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
-; OPT-NOT: convert
-; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %a)
-; OPT-NEXT: ret i1 %[[OUT]]
+define i1 @ptest_first(<vscale x 4 x i1> %a) #0 {
+; CHECK-LABEL: ptest_first
+; CHECK: %mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
+; CHECK-NOT: convert
+; CHECK-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %a)
+; CHECK-NEXT: ret i1 %[[OUT]]
%mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
%1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %mask)
%2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
@@ -40,22 +42,22 @@ define i1 @ptest_first(<vscale x 4 x i1> %a) {
ret i1 %out
}
-define i1 @ptest_first_same_ops(<vscale x 2 x i1> %a) {
-; OPT-LABEL: ptest_first_same_ops
-; OPT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.first.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %a)
-; OPT-NOT: convert
-; OPT-NEXT: ret i1 %[[OUT]]
+define i1 @ptest_first_same_ops(<vscale x 2 x i1> %a) #0 {
+; CHECK-LABEL: ptest_first_same_ops
+; CHECK: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.first.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %a)
+; CHECK-NOT: convert
+; CHECK-NEXT: ret i1 %[[OUT]]
%1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
%2 = tail call i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %1)
ret i1 %2
}
-define i1 @ptest_last(<vscale x 8 x i1> %a) {
-; OPT-LABEL: ptest_last
-; OPT: %mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
-; OPT-NOT: convert
-; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.last.nxv8i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %a)
-; OPT-NEXT: ret i1 %[[OUT]]
+define i1 @ptest_last(<vscale x 8 x i1> %a) #0 {
+; CHECK-LABEL: ptest_last
+; CHECK: %mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
+; CHECK-NOT: convert
+; CHECK-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.last.nxv8i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %a)
+; CHECK-NEXT: ret i1 %[[OUT]]
%mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
%1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %mask)
%2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
@@ -75,3 +77,5 @@ declare i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1>, <vscale x 16
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+
+attributes #0 = { "target-features"="+sve" }