[llvm] c91cd4f - [AArch64][SVE][InstCombine] Replace last{a,b} intrinsics with extracts...
Joe Ellis via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 20 03:01:54 PDT 2021
Author: Joe Ellis
Date: 2021-04-20T10:01:33Z
New Revision: c91cd4f3bb53f6f3b2cbfd6269ebb88eef410246
URL: https://github.com/llvm/llvm-project/commit/c91cd4f3bb53f6f3b2cbfd6269ebb88eef410246
DIFF: https://github.com/llvm/llvm-project/commit/c91cd4f3bb53f6f3b2cbfd6269ebb88eef410246.diff
LOG: [AArch64][SVE][InstCombine] Replace last{a,b} intrinsics with extracts...
when the predicate used by last{a,b} specifies a known vector length.
For example:
aarch64_sve_lasta(VL1, D) -> extractelement(D, #1)
aarch64_sve_lastb(VL1, D) -> extractelement(D, #0)
Co-authored-by: Paul Walker <paul.walker at arm.com>
Differential Revision: https://reviews.llvm.org/D100476
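For illustration, in LLVM IR terms the lastb/vl1 case above rewrites

  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1) ; vl1
  %e0 = call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)

into a fixed-lane extract:

  %e0 = extractelement <vscale x 16 x i8> %v, i64 0

(This sketch is distilled from the tests added below and is not part of the
commit itself; the i32 1 immediate is the AArch64SVEPredPattern::vl1 pattern.)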
Added:
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-lasta-lastb.ll
Modified:
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 754e4cf207e7c..f98a01e94a20c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <algorithm>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -279,6 +280,101 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
+static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
+ IntrinsicInst &II) {
+ Value *Pg = II.getArgOperand(0);
+ Value *Vec = II.getArgOperand(1);
+ bool IsAfter = II.getIntrinsicID() == Intrinsic::aarch64_sve_lasta;
+
+ auto *C = dyn_cast<Constant>(Pg);
+ if (IsAfter && C && C->isNullValue()) {
+ // The intrinsic is extracting lane 0 so use an extract instead.
+ auto *IdxTy = Type::getInt64Ty(II.getContext());
+ auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
+ Extract->insertBefore(&II);
+ Extract->takeName(&II);
+ return IC.replaceInstUsesWith(II, Extract);
+ }
+
+ auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
+ if (!IntrPG)
+ return None;
+
+ if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+ return None;
+
+ const auto PTruePattern =
+ cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
+
+ // Can the intrinsic's predicate be converted to a known constant index?
+ unsigned Idx;
+ switch (PTruePattern) {
+ default:
+ return None;
+ case AArch64SVEPredPattern::vl1:
+ Idx = 0;
+ break;
+ case AArch64SVEPredPattern::vl2:
+ Idx = 1;
+ break;
+ case AArch64SVEPredPattern::vl3:
+ Idx = 2;
+ break;
+ case AArch64SVEPredPattern::vl4:
+ Idx = 3;
+ break;
+ case AArch64SVEPredPattern::vl5:
+ Idx = 4;
+ break;
+ case AArch64SVEPredPattern::vl6:
+ Idx = 5;
+ break;
+ case AArch64SVEPredPattern::vl7:
+ Idx = 6;
+ break;
+ case AArch64SVEPredPattern::vl8:
+ Idx = 7;
+ break;
+ case AArch64SVEPredPattern::vl16:
+ Idx = 15;
+ break;
+ }
+
+ // Increment the index if extracting the element after the last active
+ // predicate element.
+ if (IsAfter)
+ ++Idx;
+
+  // Ignore extracts whose index is equal to or larger than the known minimum
+  // vector length. NOTE: This is an artificial constraint where we prefer to
+  // maintain what the user asked for until an alternative is proven faster.
+ auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
+ if (Idx >= PgVTy->getMinNumElements())
+ return None;
+
+ // The intrinsic is extracting a fixed lane so use an extract instead.
+ auto *IdxTy = Type::getInt64Ty(II.getContext());
+ auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
+ Extract->insertBefore(&II);
+ Extract->takeName(&II);
+ return IC.replaceInstUsesWith(II, Extract);
+}
+
+Optional<Instruction *>
+AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
+ IntrinsicInst &II) const {
+ Intrinsic::ID IID = II.getIntrinsicID();
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::aarch64_sve_lasta:
+ case Intrinsic::aarch64_sve_lastb:
+ return instCombineSVELast(IC, II);
+ }
+
+ return None;
+}
+
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
ArrayRef<const Value *> Args) {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 0fac4620b0e96..1bc2611dfaa74 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -100,6 +100,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
+ Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
+ IntrinsicInst &II) const;
+
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
switch (K) {
case TargetTransformInfo::RGK_Scalar:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-lasta-lastb.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-lasta-lastb.ll
new file mode 100644
index 0000000000000..c5919792b45d7
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-lasta-lastb.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -instcombine -S < %s | FileCheck --check-prefix OPT %s
+
+target triple = "aarch64"
+
+; Most of the testing is covered by the lastb cases, but here we ensure that
+; lasta with a predicate having no active lanes is treated as an alias to
+; extracting the first vector element.
+define i8 @lasta_extractelement_0(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lasta_extractelement_0(
+; OPT-NEXT: [[E0:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 0
+; OPT-NEXT: ret i8 [[E0]]
+;
+ %e0 = tail call i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %v)
+ ret i8 %e0
+}
+
+; Most of the testing is covered by the lastb cases, but here we check the
+; resulting extraction index is one more than the lastb case because lasta
+; extracts the element after the last active.
+define i8 @lasta_extractelement_8(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lasta_extractelement_8(
+; OPT-NEXT: [[E1:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 8
+; OPT-NEXT: ret i8 [[E1]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
+ %e1 = tail call i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e1
+}
+
+define i8 @lastb_extractelement_0(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_0(
+; OPT-NEXT: [[E0:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 0
+; OPT-NEXT: ret i8 [[E0]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+ %e0 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e0
+}
+
+define i8 @lastb_extractelement_1(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_1(
+; OPT-NEXT: [[E1:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 1
+; OPT-NEXT: ret i8 [[E1]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+ %e1 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e1
+}
+
+define i8 @lastb_extractelement_2(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_2(
+; OPT-NEXT: [[E2:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 2
+; OPT-NEXT: ret i8 [[E2]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 3)
+ %e2 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e2
+}
+
+define i8 @lastb_extractelement_3(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_3(
+; OPT-NEXT: [[E3:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 3
+; OPT-NEXT: ret i8 [[E3]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 4)
+ %e3 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e3
+}
+
+define i8 @lastb_extractelement_4(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_4(
+; OPT-NEXT: [[E4:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 4
+; OPT-NEXT: ret i8 [[E4]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 5)
+ %e4 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e4
+}
+
+define i8 @lastb_extractelement_5(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_5(
+; OPT-NEXT: [[E5:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 5
+; OPT-NEXT: ret i8 [[E5]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 6)
+ %e5 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e5
+}
+
+define i8 @lastb_extractelement_6(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_6(
+; OPT-NEXT: [[E6:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 6
+; OPT-NEXT: ret i8 [[E6]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 7)
+ %e6 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e6
+}
+
+define i8 @lastb_extractelement_7(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_7(
+; OPT-NEXT: [[E7:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 7
+; OPT-NEXT: ret i8 [[E7]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
+ %e7 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e7
+}
+
+define i8 @lastb_extractelement_15(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_15(
+; OPT-NEXT: [[E15:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 15
+; OPT-NEXT: ret i8 [[E15]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 9)
+ %e15 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e15
+}
+
+; No transformation because the requested element is beyond the known
+; minimum element count, so we maintain the user's intentions.
+define i8 @lastb_extractelement_31(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_31(
+; OPT-NEXT: [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
+; OPT-NEXT: [[E31:%.*]] = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[V:%.*]])
+; OPT-NEXT: ret i8 [[E31]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
+ %e31 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e31
+}
+
+; No transformation because the ptrue's predicate pattern is bogus and thus
+; nothing can be inferred about the result.
+define i8 @lastb_extractelement_invalid_predicate_pattern(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_invalid_predicate_pattern(
+; OPT-NEXT: [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 15)
+; OPT-NEXT: [[E:%.*]] = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[V:%.*]])
+; OPT-NEXT: ret i8 [[E]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 15)
+ %e = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)
+declare i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)
+
+attributes #0 = { "target-features"="+sve" }
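Note that nothing in instCombineSVELast is specific to <vscale x 16 x i8>:
only the predicate operand is inspected, so the same fold applies to other
element types. As a hand-written sketch (not from the commit; the nxv4i32
variants follow the same naming convention as the declarations above), running
the following through opt -instcombine -S, as in the RUN line:

  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 2) ; vl2
  %x = call i32 @llvm.aarch64.sve.lastb.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %v)

is expected to produce:

  %x = extractelement <vscale x 4 x i32> %v, i64 1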