[llvm] c91cd4f - [AArch64][SVE][InstCombine] Replace last{a,b} intrinsics with extracts...
Joe Ellis via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 20 03:01:54 PDT 2021
Author: Joe Ellis
Date: 2021-04-20T10:01:33Z
New Revision: c91cd4f3bb53f6f3b2cbfd6269ebb88eef410246
URL: https://github.com/llvm/llvm-project/commit/c91cd4f3bb53f6f3b2cbfd6269ebb88eef410246
DIFF: https://github.com/llvm/llvm-project/commit/c91cd4f3bb53f6f3b2cbfd6269ebb88eef410246.diff
LOG: [AArch64][SVE][InstCombine] Replace last{a,b} intrinsics with extracts...
when the predicate used by last{a,b} specifies a known vector length.
For example:
aarch64_sve_lasta(VL1, D) -> extractelement(D, #1)
aarch64_sve_lastb(VL1, D) -> extractelement(D, #0)
Co-authored-by: Paul Walker <paul.walker at arm.com>
Differential Revision: https://reviews.llvm.org/D100476
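For illustration, in LLVM IR terms the lastb/vl1 case above rewrites

  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1) ; vl1
  %e0 = call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)

into a fixed-lane extract:

  %e0 = extractelement <vscale x 16 x i8> %v, i64 0

(This sketch is distilled from the tests added below and is not part of the
commit itself; the i32 1 immediate is the AArch64SVEPredPattern::vl1 pattern.)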
Added:
llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-lasta-lastb.ll
Modified:
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 754e4cf207e7c..f98a01e94a20c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <algorithm>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -279,6 +280,101 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
+static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
+ IntrinsicInst &II) {
+ Value *Pg = II.getArgOperand(0);
+ Value *Vec = II.getArgOperand(1);
+ bool IsAfter = II.getIntrinsicID() == Intrinsic::aarch64_sve_lasta;
+
+ auto *C = dyn_cast<Constant>(Pg);
+ if (IsAfter && C && C->isNullValue()) {
+ // The intrinsic is extracting lane 0 so use an extract instead.
+ auto *IdxTy = Type::getInt64Ty(II.getContext());
+ auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
+ Extract->insertBefore(&II);
+ Extract->takeName(&II);
+ return IC.replaceInstUsesWith(II, Extract);
+ }
+
+ auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
+ if (!IntrPG)
+ return None;
+
+ if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+ return None;
+
+ const auto PTruePattern =
+ cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
+
+ // Can the intrinsic's predicate be converted to a known constant index?
+ unsigned Idx;
+ switch (PTruePattern) {
+ default:
+ return None;
+ case AArch64SVEPredPattern::vl1:
+ Idx = 0;
+ break;
+ case AArch64SVEPredPattern::vl2:
+ Idx = 1;
+ break;
+ case AArch64SVEPredPattern::vl3:
+ Idx = 2;
+ break;
+ case AArch64SVEPredPattern::vl4:
+ Idx = 3;
+ break;
+ case AArch64SVEPredPattern::vl5:
+ Idx = 4;
+ break;
+ case AArch64SVEPredPattern::vl6:
+ Idx = 5;
+ break;
+ case AArch64SVEPredPattern::vl7:
+ Idx = 6;
+ break;
+ case AArch64SVEPredPattern::vl8:
+ Idx = 7;
+ break;
+ case AArch64SVEPredPattern::vl16:
+ Idx = 15;
+ break;
+ }
+
+ // Increment the index if extracting the element after the last active
+ // predicate element.
+ if (IsAfter)
+ ++Idx;
+
+  // Ignore extracts whose index is equal to or larger than the known minimum
+  // vector length. NOTE: This is an artificial constraint where we prefer to
+  // maintain what the user asked for until an alternative is proven faster.
+ auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
+ if (Idx >= PgVTy->getMinNumElements())
+ return None;
+
+ // The intrinsic is extracting a fixed lane so use an extract instead.
+ auto *IdxTy = Type::getInt64Ty(II.getContext());
+ auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
+ Extract->insertBefore(&II);
+ Extract->takeName(&II);
+ return IC.replaceInstUsesWith(II, Extract);
+}
+
+Optional<Instruction *>
+AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
+ IntrinsicInst &II) const {
+ Intrinsic::ID IID = II.getIntrinsicID();
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::aarch64_sve_lasta:
+ case Intrinsic::aarch64_sve_lastb:
+ return instCombineSVELast(IC, II);
+ }
+
+ return None;
+}
+
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
ArrayRef<const Value *> Args) {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 0fac4620b0e96..1bc2611dfaa74 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -100,6 +100,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
+ Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
+ IntrinsicInst &II) const;
+
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
switch (K) {
case TargetTransformInfo::RGK_Scalar:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-lasta-lastb.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-lasta-lastb.ll
new file mode 100644
index 0000000000000..c5919792b45d7
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-lasta-lastb.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -instcombine -S < %s | FileCheck --check-prefix OPT %s
+
+target triple = "aarch64"
+
+; Most of the testing is covered by the lastb cases, but here we ensure that
+; lasta with a predicate having no active lanes is treated as an alias to
+; extracting the first vector element.
+define i8 @lasta_extractelement_0(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lasta_extractelement_0(
+; OPT-NEXT: [[E0:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 0
+; OPT-NEXT: ret i8 [[E0]]
+;
+ %e0 = tail call i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %v)
+ ret i8 %e0
+}
+
+; Most of the testing is covered by the lastb cases, but here we check the
+; resulting extraction index is one more than the lastb case because lasta
+; extracts the element after the last active.
+define i8 @lasta_extractelement_8(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lasta_extractelement_8(
+; OPT-NEXT: [[E1:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 8
+; OPT-NEXT: ret i8 [[E1]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
+ %e1 = tail call i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e1
+}
+
+define i8 @lastb_extractelement_0(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_0(
+; OPT-NEXT: [[E0:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 0
+; OPT-NEXT: ret i8 [[E0]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+ %e0 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e0
+}
+
+define i8 @lastb_extractelement_1(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_1(
+; OPT-NEXT: [[E1:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 1
+; OPT-NEXT: ret i8 [[E1]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+ %e1 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e1
+}
+
+define i8 @lastb_extractelement_2(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_2(
+; OPT-NEXT: [[E2:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 2
+; OPT-NEXT: ret i8 [[E2]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 3)
+ %e2 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e2
+}
+
+define i8 @lastb_extractelement_3(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_3(
+; OPT-NEXT: [[E3:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 3
+; OPT-NEXT: ret i8 [[E3]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 4)
+ %e3 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e3
+}
+
+define i8 @lastb_extractelement_4(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_4(
+; OPT-NEXT: [[E4:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 4
+; OPT-NEXT: ret i8 [[E4]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 5)
+ %e4 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e4
+}
+
+define i8 @lastb_extractelement_5(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_5(
+; OPT-NEXT: [[E5:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 5
+; OPT-NEXT: ret i8 [[E5]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 6)
+ %e5 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e5
+}
+
+define i8 @lastb_extractelement_6(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_6(
+; OPT-NEXT: [[E6:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 6
+; OPT-NEXT: ret i8 [[E6]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 7)
+ %e6 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e6
+}
+
+define i8 @lastb_extractelement_7(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_7(
+; OPT-NEXT: [[E7:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 7
+; OPT-NEXT: ret i8 [[E7]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
+ %e7 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e7
+}
+
+define i8 @lastb_extractelement_15(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_15(
+; OPT-NEXT: [[E15:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 15
+; OPT-NEXT: ret i8 [[E15]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 9)
+ %e15 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e15
+}
+
+; No transformation because the requested element is beyond the known
+; minimum element count, so we maintain the user's intentions.
+define i8 @lastb_extractelement_31(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_31(
+; OPT-NEXT: [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
+; OPT-NEXT: [[E31:%.*]] = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[V:%.*]])
+; OPT-NEXT: ret i8 [[E31]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
+ %e31 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e31
+}
+
+; No transformation because the ptrue's predicate pattern is bogus and thus
+; nothing can be inferred about the result.
+define i8 @lastb_extractelement_invalid_predicate_pattern(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_invalid_predicate_pattern(
+; OPT-NEXT: [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 15)
+; OPT-NEXT: [[E:%.*]] = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[V:%.*]])
+; OPT-NEXT: ret i8 [[E]]
+;
+ %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 15)
+ %e = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+ ret i8 %e
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)
+declare i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)
+
+attributes #0 = { "target-features"="+sve" }
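Note that nothing in instCombineSVELast is specific to <vscale x 16 x i8>:
only the predicate operand is inspected, so the same fold applies to other
element types. As a hand-written sketch (not from the commit; the nxv4i32
variants follow the same naming convention as the declarations above), running
the following through opt -instcombine -S, as in the RUN line:

  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 2) ; vl2
  %x = call i32 @llvm.aarch64.sve.lastb.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %v)

is expected to produce:

  %x = extractelement <vscale x 4 x i32> %v, i64 1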