[llvm] c9439ca - [AArch64][SVE] Coalesce calls to the SVE ptrue intrinsic where possible

James Y Knight via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 4 08:31:05 PST 2021


It looks like you pushed this commit to the branch "master" -- which isn't
in use anymore -- rather than "main".

On Thu, Feb 4, 2021 at 9:11 AM Joe Ellis via llvm-commits <llvm-commits at lists.llvm.org> wrote:

>
> Author: Joe Ellis
> Date: 2021-02-04T14:10:50Z
> New Revision: c9439ca36342fb6013187d0a69aef92736951476
>
> URL: https://github.com/llvm/llvm-project/commit/c9439ca36342fb6013187d0a69aef92736951476
> DIFF: https://github.com/llvm/llvm-project/commit/c9439ca36342fb6013187d0a69aef92736951476.diff
>
> LOG: [AArch64][SVE] Coalesce calls to the SVE ptrue intrinsic where possible
>
> It is possible to eliminate redundant calls to the SVE ptrue intrinsic.
> For example: suppose that we have two SVE ptrue intrinsic calls P1 and
> P2. If P1 is at least as wide as P2, then P2 can be written as a
> reinterpret of P1 using the SVE reinterpret intrinsics.
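>
> As an illustrative sketch (written in the same style as the examples in
> the patch below; the value names %p1, %p2 and %b are placeholders), two
> all-true ptrues such as
>
>   %p1 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
>   %p2 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
>
> can be coalesced by rewriting the narrower %p2 in terms of %p1:
>
>   %p1 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
>   %b  = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %p1)
>   %p2 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %b)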
>
> Coalescing ptrue intrinsics can result in fewer ptrue instructions in
> the codegen, and is conducive to better analysis further down the line.
>
> This commit extends the aarch64-sve-intrinsic-opts pass to support
> coalescing ptrue intrinsic calls.
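>
> The new behaviour can be exercised directly with opt, mirroring the RUN
> line of the test added below (the input file name here is a placeholder):
>
>   opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve input.ll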
>
> Differential Revision: https://reviews.llvm.org/D94230
>
> Added:
>     llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll
>
> Modified:
>     llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
>
> Removed:
>
>
>
>
> ################################################################################
> diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
> index 9911f33371c6..3d9080f7997d 100644
> --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
> +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
> @@ -9,14 +9,20 @@
>  //
>  // Performs general IR level optimizations on SVE intrinsics.
>  //
> -// The main goal of this pass is to remove unnecessary reinterpret
> -// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
> +// This pass performs the following optimizations:
>  //
> -//   %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
> -//   %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
> +// - removes unnecessary reinterpret intrinsics
> +//   (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
> +//     %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
> +//     %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
>  //
> -// This pass also looks for ptest intrinsics & phi instructions where the
> -// operands are being needlessly converted to and from svbool_t.
> +// - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g:
> +//     %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
> +//     %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
> +//     ; (%1 can be replaced with a reinterpret of %2)
> +//
> +// - optimizes ptest intrinsics and phi instructions where the operands are
> +//   being needlessly converted to and from svbool_t.
>  //
>
>  //===----------------------------------------------------------------------===//
>
> @@ -56,8 +62,17 @@ struct SVEIntrinsicOpts : public ModulePass {
>  private:
>    static IntrinsicInst *isReinterpretToSVBool(Value *V);
>
> +  bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
> +                                   SmallSetVector<IntrinsicInst *, 4> &PTrues);
> +  bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
> +
> +  /// Operates at the instruction-scope. I.e., optimizations are applied local
> +  /// to individual instructions.
>    static bool optimizeIntrinsic(Instruction *I);
> +  bool optimizeIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
>
> +  /// Operates at the function-scope. I.e., optimizations are applied local to
> +  /// the functions themselves.
>    bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
>
>    static bool optimizeConvertFromSVBool(IntrinsicInst *I);
> @@ -95,6 +110,188 @@ IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
>    return I;
>  }
>
> +/// Checks if a ptrue intrinsic call is promoted. The act of promoting a
> +/// ptrue will introduce zeroing. For example:
> +///
> +///     %1 = <vscale x 4 x i1> call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
> +///     %2 = <vscale x 16 x i1> call @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
> +///     %3 = <vscale x 8 x i1> call @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
> +///
> +/// %1 is promoted, because it is converted:
> +///
> +///     <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1>
> +///
> +/// via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool.
> +bool isPTruePromoted(IntrinsicInst *PTrue) {
> +  // Find all users of this intrinsic that are calls to convert-to-svbool
> +  // reinterpret intrinsics.
> +  SmallVector<IntrinsicInst *, 4> ConvertToUses;
> +  for (User *User : PTrue->users()) {
> +    if (match(User, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>())) {
> +      ConvertToUses.push_back(cast<IntrinsicInst>(User));
> +    }
> +  }
> +
> +  // If no such calls were found, this ptrue is not promoted.
> +  if (ConvertToUses.empty())
> +    return false;
> +
> +  // Otherwise, try to find users of the convert-to-svbool intrinsics that are
> +  // calls to the convert-from-svbool intrinsic, and would result in some lanes
> +  // being zeroed.
> +  const auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType());
> +  for (IntrinsicInst *ConvertToUse : ConvertToUses) {
> +    for (User *User : ConvertToUse->users()) {
> +      auto *IntrUser = dyn_cast<IntrinsicInst>(User);
> +      if (IntrUser && IntrUser->getIntrinsicID() ==
> +                          Intrinsic::aarch64_sve_convert_from_svbool) {
> +        const auto *IntrUserVTy = cast<ScalableVectorType>(IntrUser->getType());
> +
> +        // Would some lanes become zeroed by the conversion?
> +        if (IntrUserVTy->getElementCount().getKnownMinValue() >
> +            PTrueVTy->getElementCount().getKnownMinValue())
> +          // This is a promoted ptrue.
> +          return true;
> +      }
> +    }
> +  }
> +
> +  // If no matching calls were found, this is not a promoted ptrue.
> +  return false;
> +}
> +
> +/// Attempts to coalesce ptrues in a basic block.
> +bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls(
> +    BasicBlock &BB, SmallSetVector<IntrinsicInst *, 4> &PTrues) {
> +  if (PTrues.size() <= 1)
> +    return false;
> +
> +  // Find the ptrue with the most lanes.
> +  auto *MostEncompassingPTrue = *std::max_element(
> +      PTrues.begin(), PTrues.end(), [](auto *PTrue1, auto *PTrue2) {
> +        auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType());
> +        auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType());
> +        return PTrue1VTy->getElementCount().getKnownMinValue() <
> +               PTrue2VTy->getElementCount().getKnownMinValue();
> +      });
> +
> +  // Remove the most encompassing ptrue, as well as any promoted ptrues, leaving
> +  // behind only the ptrues to be coalesced.
> +  PTrues.remove(MostEncompassingPTrue);
> +  PTrues.remove_if([](auto *PTrue) { return isPTruePromoted(PTrue); });
> +
> +  // Hoist MostEncompassingPTrue to the start of the basic block. It is always
> +  // safe to do this, since ptrue intrinsic calls are guaranteed to have no
> +  // predecessors.
> +  MostEncompassingPTrue->moveBefore(BB, BB.getFirstInsertionPt());
> +
> +  LLVMContext &Ctx = BB.getContext();
> +  IRBuilder<> Builder(Ctx);
> +  Builder.SetInsertPoint(&BB, ++MostEncompassingPTrue->getIterator());
> +
> +  auto *MostEncompassingPTrueVTy =
> +      cast<VectorType>(MostEncompassingPTrue->getType());
> +  auto *ConvertToSVBool = Builder.CreateIntrinsic(
> +      Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy},
> +      {MostEncompassingPTrue});
> +
> +  for (auto *PTrue : PTrues) {
> +    auto *PTrueVTy = cast<VectorType>(PTrue->getType());
> +
> +    Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
> +    auto *ConvertFromSVBool =
> +        Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
> +                                {PTrueVTy}, {ConvertToSVBool});
> +    PTrue->replaceAllUsesWith(ConvertFromSVBool);
> +    PTrue->eraseFromParent();
> +  }
> +
> +  return true;
> +}
> +
> +/// The goal of this function is to remove redundant calls to the SVE ptrue
> +/// intrinsic in each basic block within the given functions.
> +///
> +/// SVE ptrues have two representations in LLVM IR:
> +/// - a logical representation -- an arbitrary-width scalable vector of i1s,
> +///   i.e. <vscale x N x i1>.
> +/// - a physical representation (svbool, <vscale x 16 x i1>) -- a 16-element
> +///   scalable vector of i1s, i.e. <vscale x 16 x i1>.
> +///
> +/// The SVE ptrue intrinsic is used to create a logical representation of an SVE
> +/// predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and P2. If
> +/// P1 creates a logical SVE predicate that is at least as wide as the logical
> +/// SVE predicate created by P2, then all of the bits that are true in the
> +/// physical representation of P2 are necessarily also true in the physical
> +/// representation of P1. P1 'encompasses' P2, therefore, the intrinsic call to
> +/// P2 is redundant and can be replaced by an SVE reinterpret of P1 via
> +/// convert.{to,from}.svbool.
> +///
> +/// Currently, this pass only coalesces calls to SVE ptrue intrinsics
> +/// if they match the following conditions:
> +///
> +/// - the call to the intrinsic uses either the SV_ALL or SV_POW2 patterns.
> +///   SV_ALL indicates that all bits of the predicate vector are to be set to
> +///   true. SV_POW2 indicates that all bits of the predicate vector up to the
> +///   largest power-of-two are to be set to true.
> +/// - the result of the call to the intrinsic is not promoted to a wider
> +///   predicate. In this case, keeping the extra ptrue leads to better codegen
> +///   -- coalescing here would create an irreducible chain of SVE reinterprets
> +///   via convert.{to,from}.svbool.
> +///
> +/// EXAMPLE:
> +///
> +///     %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
> +///     ; Logical:  <1, 1, 1, 1, 1, 1, 1, 1>
> +///     ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0>
> +///     ...
> +///
> +///     %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL)
> +///     ; Logical:  <1, 1, 1, 1>
> +///     ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0>
> +///     ...
> +///
> +/// Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance:
> +///
> +///     %1 = <vscale x 8 x i1> ptrue(i32 31)
> +///     %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1)
> +///     %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2)
> +///
> +bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
> +    SmallSetVector<Function *, 4> &Functions) {
> +  bool Changed = false;
> +
> +  for (auto *F : Functions) {
> +    for (auto &BB : *F) {
> +      SmallSetVector<IntrinsicInst *, 4> SVAllPTrues;
> +      SmallSetVector<IntrinsicInst *, 4> SVPow2PTrues;
> +
> +      // For each basic block, collect the used ptrues and try to coalesce them.
> +      for (Instruction &I : BB) {
> +        if (I.use_empty())
> +          continue;
> +
> +        auto *IntrI = dyn_cast<IntrinsicInst>(&I);
> +        if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
> +          continue;
> +
> +        const auto PTruePattern =
> +            cast<ConstantInt>(IntrI->getOperand(0))->getZExtValue();
> +
> +        if (PTruePattern == AArch64SVEPredPattern::all)
> +          SVAllPTrues.insert(IntrI);
> +        if (PTruePattern == AArch64SVEPredPattern::pow2)
> +          SVPow2PTrues.insert(IntrI);
> +      }
> +
> +      Changed |= coalescePTrueIntrinsicCalls(BB, SVAllPTrues);
> +      Changed |= coalescePTrueIntrinsicCalls(BB, SVPow2PTrues);
> +    }
> +  }
> +
> +  return Changed;
> +}
> +
>  /// The function will remove redundant reinterprets casting in the presence
>  /// of the control flow
>  bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {
> @@ -243,7 +440,7 @@ bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
>    return true;
>  }
>
> -bool SVEIntrinsicOpts::optimizeFunctions(
> +bool SVEIntrinsicOpts::optimizeIntrinsicCalls(
>      SmallSetVector<Function *, 4> &Functions) {
>    bool Changed = false;
>    for (auto *F : Functions) {
> @@ -260,6 +457,16 @@ bool SVEIntrinsicOpts::optimizeFunctions(
>    return Changed;
>  }
>
> +bool SVEIntrinsicOpts::optimizeFunctions(
> +    SmallSetVector<Function *, 4> &Functions) {
> +  bool Changed = false;
> +
> +  Changed |= optimizePTrueIntrinsicCalls(Functions);
> +  Changed |= optimizeIntrinsicCalls(Functions);
> +
> +  return Changed;
> +}
> +
>  bool SVEIntrinsicOpts::runOnModule(Module &M) {
>    bool Changed = false;
>    SmallSetVector<Function *, 4> Functions;
> @@ -276,6 +483,7 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) {
>      case Intrinsic::aarch64_sve_ptest_any:
>      case Intrinsic::aarch64_sve_ptest_first:
>      case Intrinsic::aarch64_sve_ptest_last:
> +    case Intrinsic::aarch64_sve_ptrue:
>        for (User *U : F.users())
>          Functions.insert(cast<Instruction>(U)->getFunction());
>        break;
>
> diff --git a/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll b/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll
> new file mode 100644
> index 000000000000..526f7bc52f59
> --- /dev/null
> +++ b/llvm/test/CodeGen/AArch64/sve-coalesce-ptrue-intrinsics.ll
> @@ -0,0 +1,189 @@
> +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
> +; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
> +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
> +
> +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
> +; WARN-NOT: warning
> +
> +declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
> +declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 immarg)
> +declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 immarg)
> +declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
> +
> +declare <vscale x 16 x i32> @llvm.aarch64.sve.ld1.nxv16i32(<vscale x 16 x i1>, i32*)
> +declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1>, i32*)
> +declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1>, i32*)
> +declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
> +declare <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1>, i32*)
> +
> +declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
> +declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
> +
> +; Two calls to the SVE ptrue intrinsic. %1 is redundant, and can be expressed as an SVE reinterpret of %3 via
> +; convert.{to,from}.svbool.
> +define <vscale x 8 x i32> @coalesce_test_basic(i32* %addr) {
> +; CHECK-LABEL: @coalesce_test_basic(
> +; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
> +; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
> +; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
> +; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], i32* [[ADDR:%.*]])
> +; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP1]], i32* [[ADDR]])
> +; CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP5]]
> +;
> +  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
> +  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
> +  %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
> +  %4 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %3, i32* %addr)
> +  ret <vscale x 8 x i32> %4
> +}
> +
> +; Two calls to the SVE ptrue intrinsic with the SV_POW2 pattern. This should reduce to the same output as
> +; coalesce_test_basic.
> +define <vscale x 8 x i32> @coalesce_test_pow2(i32* %addr) {
> +; CHECK-LABEL: @coalesce_test_pow2(
> +; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
> +; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
> +; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
> +; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], i32* [[ADDR:%.*]])
> +; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP1]], i32* [[ADDR]])
> +; CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP5]]
> +;
> +  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
> +  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
> +  %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
> +  %4 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %3, i32* %addr)
> +  ret <vscale x 8 x i32> %4
> +}
> +
> +; Four calls to the SVE ptrue intrinsic; two with the SV_ALL pattern, and two with the SV_POW2 pattern. The
> +; two SV_ALL ptrue intrinsics should be coalesced, and the two SV_POW2 intrinsics should be coalesced.
> +define <vscale x 8 x i32> @coalesce_test_all_and_pow2(i32* %addr) {
> +; CHECK-LABEL: @coalesce_test_all_and_pow2(
> +; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
> +; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
> +; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
> +; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
> +; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP4]])
> +; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP5]])
> +; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], i32* [[ADDR:%.*]])
> +; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP1]], i32* [[ADDR]])
> +; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP6]], i32* [[ADDR]])
> +; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP4]], i32* [[ADDR]])
> +; CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP10]]
> +;
> +  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
> +  %2 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
> +  %3 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
> +  %4 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
> +
> +  %5 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
> +  %6 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %2, i32* %addr)
> +  %7 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %3, i32* %addr)
> +  %8 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %4, i32* %addr)
> +  ret <vscale x 8 x i32> %8
> +}
> +
> +
> +; Two calls to the SVE ptrue intrinsic: one with the SV_ALL pattern, another with the SV_POW2 pattern. The
> +; patterns are incompatible, so they should not be coalesced.
> +define <vscale x 8 x i32> @coalesce_test_pattern_mismatch2(i32* %addr) {
> +; CHECK-LABEL: @coalesce_test_pattern_mismatch2(
> +; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
> +; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP1]], i32* [[ADDR:%.*]])
> +; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
> +; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP3]], i32* [[ADDR]])
> +; CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
> +;
> +  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
> +  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
> +  %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
> +  %4 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %3, i32* %addr)
> +  ret <vscale x 8 x i32> %4
> +}
> +
> +; Two calls to the SVE ptrue intrinsic with the SV_VL1 pattern. This pattern is not currently recognised, so
> +; nothing should be done here.
> +define <vscale x 8 x i32> @coalesce_test_bad_pattern(i32* %addr) {
> +; CHECK-LABEL: @coalesce_test_bad_pattern(
> +; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1)
> +; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP1]], i32* [[ADDR:%.*]])
> +; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
> +; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP3]], i32* [[ADDR]])
> +; CHECK-NEXT:    ret <vscale x 8 x i32> [[TMP4]]
> +;
> +  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1)
> +  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
> +  %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
> +  %4 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %3, i32* %addr)
> +  ret <vscale x 8 x i32> %4
> +}
> +
> +; Four calls to the SVE ptrue intrinsic. %7 is the most encompassing, and the others can be expressed as
> +; SVE reinterprets of %7 via convert.{to,from}.svbool.
> +define <vscale x 16 x i32> @coalesce_test_multiple(i32* %addr) {
> +; CHECK-LABEL: @coalesce_test_multiple(
> +; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
> +; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv16i1(<vscale x 16 x i1> [[TMP1]])
> +; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP2]])
> +; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[TMP2]])
> +; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[TMP2]])
> +; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> [[TMP5]], i32* [[ADDR:%.*]])
> +; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP4]], i32* [[ADDR]])
> +; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> [[TMP3]], i32* [[ADDR]])
> +; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 16 x i32> @llvm.aarch64.sve.ld1.nxv16i32(<vscale x 16 x i1> [[TMP1]], i32* [[ADDR]])
> +; CHECK-NEXT:    ret <vscale x 16 x i32> [[TMP9]]
> +;
> +  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
> +  %2 = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %1, i32* %addr)
> +  %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
> +  %4 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %3, i32* %addr)
> +  %5 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
> +  %6 = call <vscale x 8 x i32> @llvm.aarch64.sve.ld1.nxv8i32(<vscale x 8 x i1> %5, i32* %addr)
> +  %7 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
> +  %8 = call <vscale x 16 x i32> @llvm.aarch64.sve.ld1.nxv16i32(<vscale x 16 x i1> %7, i32* %addr)
> +  ret <vscale x 16 x i32> %8
> +}
> +
> +; Two calls to the SVE ptrue intrinsic which are both of the same size. In this case, one should be identified
> +; as redundant and rewritten as an SVE reinterpret of the other via the convert.{to,from}.svbool intrinsics.
> +; This introduces a redundant conversion which will then be eliminated.
> +define <vscale x 4 x i32> @coalesce_test_same_size(i32* %addr) {
> +; CHECK-LABEL: @coalesce_test_same_size(
> +; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
> +; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP1]], i32* [[ADDR:%.*]])
> +; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP1]], i32* [[ADDR]])
> +; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
> +;
> +  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
> +  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr)
> +  %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
> +  %4 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %3, i32* %addr)
> +  ret <vscale x 4 x i32> %4
> +}
> +
> +; Two calls to the SVE ptrue intrinsic, but neither can be eliminated; %1 is promoted to become %3, which
> +; means eliminating this call to the SVE ptrue intrinsic would involve creating a longer, irreducible chain of
> +; conversions. Better codegen is achieved by just leaving the ptrue as-is.
> +define <vscale x 8 x i16> @coalesce_test_promoted_ptrue(i32* %addr1, i16* %addr2) {
> +; CHECK-LABEL: @coalesce_test_promoted_ptrue(
> +; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
> +; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> [[TMP1]])
> +; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
> +; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> [[TMP3]])
> +; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP4]])
> +; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> [[TMP3]], i32* [[ADDR1:%.*]])
> +; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP5]], i16* [[ADDR2:%.*]])
> +; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> [[TMP1]], i16* [[ADDR2]])
> +; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP8]]
> +;
> +  %1 = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
> +  %2 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
> +  %3 = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %2)
> +
> +  %4 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, i32* %addr1)
> +  %5 = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> %3, i16* %addr2)
> +
> +  %6 = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
> +  %7 = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> %6, i16* %addr2)
> +  ret <vscale x 8 x i16> %7
> +}
>
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>