[llvm] 137459a - [AArch64][SME] Disable (SLP|Loop)Vectorizer when function may be executed in streaming mode.
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 19 09:43:15 PDT 2022
Author: Sander de Smalen
Date: 2022-10-19T16:42:20Z
New Revision: 137459aff6ec2c5f57f978c8feb689fcc8010b62
URL: https://github.com/llvm/llvm-project/commit/137459aff6ec2c5f57f978c8feb689fcc8010b62
DIFF: https://github.com/llvm/llvm-project/commit/137459aff6ec2c5f57f978c8feb689fcc8010b62.diff
LOG: [AArch64][SME] Disable (SLP|Loop)Vectorizer when function may be executed in streaming mode.
When the SME attributes tell that a function is or may be executed in Streaming
SVE mode, we currently need to be conservative and disable _any_ vectorization
(fixed or scalable) because the code-generator does not yet support generating
streaming-compatible code.
Scalable auto-vec will be gradually enabled in the future when we have
confidence that the loop-vectorizer won't use any SVE or NEON instructions
that are illegal in Streaming SVE mode.
Reviewed By: paulwalker-arm
Differential Revision: https://reviews.llvm.org/D135950
Added:
llvm/test/Transforms/LoopVectorize/AArch64/sme-vectorize.ll
Modified:
llvm/lib/Target/AArch64/AArch64Subtarget.cpp
llvm/lib/Target/AArch64/AArch64Subtarget.h
llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 75a0c34c6fe73..9a78cbd45cbd0 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -278,12 +278,14 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian,
unsigned MinSVEVectorSizeInBitsOverride,
- unsigned MaxSVEVectorSizeInBitsOverride)
+ unsigned MaxSVEVectorSizeInBitsOverride,
+ bool StreamingSVEModeDisabled)
: AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
IsLittle(LittleEndian),
+ StreamingSVEModeDisabled(StreamingSVEModeDisabled),
MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 427a4178670d7..15c3961087d1c 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -121,6 +121,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
bool IsLittle;
+ bool StreamingSVEModeDisabled;
unsigned MinSVEVectorSizeInBits;
unsigned MaxSVEVectorSizeInBits;
unsigned VScaleForTuning = 2;
@@ -158,7 +159,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
const std::string &TuneCPU, const std::string &FS,
const TargetMachine &TM, bool LittleEndian,
unsigned MinSVEVectorSizeInBitsOverride = 0,
- unsigned MaxSVEVectorSizeInBitsOverride = 0);
+ unsigned MaxSVEVectorSizeInBitsOverride = 0,
+ bool StreamingSVEModeDisabled = true);
// Getters for SubtargetFeatures defined in tablegen
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
@@ -198,6 +200,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
bool isXRaySupported() const override { return true; }
unsigned getMinVectorRegisterBitWidth() const {
+ // Don't assume any minimum vector size when PSTATE.SM may not be 0.
+ if (!isStreamingSVEModeDisabled())
+ return 0;
return MinVectorRegisterBitWidth;
}
@@ -391,6 +396,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
return "__security_check_cookie";
}
+ bool isStreamingSVEModeDisabled() const { return StreamingSVEModeDisabled; }
};
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index a6f81d8f2cd78..594bb6d97a504 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -388,6 +388,11 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
SmallString<512> Key;
+ bool StreamingSVEModeDisabled =
+ !F.hasFnAttribute("aarch64_pstate_sm_enabled") &&
+ !F.hasFnAttribute("aarch64_pstate_sm_compatible") &&
+ !F.hasFnAttribute("aarch64_pstate_sm_body");
+
unsigned MinSVEVectorSize = 0;
unsigned MaxSVEVectorSize = 0;
Attribute VScaleRangeAttr = F.getFnAttribute(Attribute::VScaleRange);
@@ -420,6 +425,7 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
Key += "SVEMin";
Key += std::to_string(MinSVEVectorSize);
Key += "SVEMax";
Key += std::to_string(MaxSVEVectorSize);
+ Key += "StreamingSVEModeDisabled=" + std::to_string(StreamingSVEModeDisabled);
Key += CPU;
Key += TuneCPU;
@@ -431,9 +437,9 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = std::make_unique<AArch64Subtarget>(TargetTriple, CPU, TuneCPU, FS,
- *this, isLittle, MinSVEVectorSize,
- MaxSVEVectorSize);
+ I = std::make_unique<AArch64Subtarget>(
+ TargetTriple, CPU, TuneCPU, FS, *this, isLittle, MinSVEVectorSize,
+ MaxSVEVectorSize, StreamingSVEModeDisabled);
}
return I.get();
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9a95d2ae30921..cbb3d793899a9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -106,6 +106,18 @@ cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
"recurrences"),
cl::location(TailFoldingKindLoc));
+// Experimental option that will only be fully functional when the
+// code-generator is changed to use SVE instead of NEON for all fixed-width
+// operations.
+static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
+ "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
+
+// Experimental option that will only be fully functional when the cost-model
+// and code-generator have been changed to avoid using scalable vector
+// instructions that are not legal in streaming SVE mode.
+static cl::opt<bool> EnableScalableAutovecInStreamingMode(
+ "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
+
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
SMEAttrs CallerAttrs(*Caller);
@@ -1487,6 +1499,30 @@ Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
return None;
}
+TypeSize
+AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(64);
+ case TargetTransformInfo::RGK_FixedWidthVector:
+ if (!ST->isStreamingSVEModeDisabled() &&
+ !EnableFixedwidthAutovecInStreamingMode)
+ return TypeSize::getFixed(0);
+
+ if (ST->hasSVE())
+ return TypeSize::getFixed(
+ std::max(ST->getMinSVEVectorSizeInBits(), 128u));
+
+ return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
+ case TargetTransformInfo::RGK_ScalableVector:
+ if (!ST->isStreamingSVEModeDisabled() && !EnableScalableAutovecInStreamingMode)
+ return TypeSize::getScalable(0);
+
+ return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
+ }
+ llvm_unreachable("Unsupported register kind");
+}
+
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
ArrayRef<const Value *> Args) {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 55961babfac11..7db2a60a66df8 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -120,20 +120,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
std::function<void(Instruction *, unsigned, APInt, APInt &)>
SimplifyAndSetOp) const;
- TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
- switch (K) {
- case TargetTransformInfo::RGK_Scalar:
- return TypeSize::getFixed(64);
- case TargetTransformInfo::RGK_FixedWidthVector:
- if (ST->hasSVE())
- return TypeSize::getFixed(
- std::max(ST->getMinSVEVectorSizeInBits(), 128u));
- return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
- case TargetTransformInfo::RGK_ScalableVector:
- return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
- }
- llvm_unreachable("Unsupported register kind");
- }
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
unsigned getMinVectorRegisterBitWidth() const {
return ST->getMinVectorRegisterBitWidth();
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sme-vectorize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sme-vectorize.ll
new file mode 100644
index 0000000000000..dba56e931a174
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sme-vectorize.ll
@@ -0,0 +1,114 @@
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_enabled/ %s | opt -loop-vectorize -slp-vectorizer -S - | FileCheck %s --check-prefix=CHECK
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_enabled/ %s | opt -loop-vectorize -slp-vectorizer -S -enable-scalable-autovec-in-streaming-mode - | FileCheck %s --check-prefix=CHECK-FORCE-SCALABLE
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_enabled/ %s | opt -loop-vectorize -slp-vectorizer -S -enable-fixedwidth-autovec-in-streaming-mode - | FileCheck %s --check-prefix=CHECK-FORCE-FIXEDWIDTH
+
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_compatible/ %s | opt -loop-vectorize -slp-vectorizer -S - | FileCheck %s --check-prefix=CHECK
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_compatible/ %s | opt -loop-vectorize -slp-vectorizer -S -enable-scalable-autovec-in-streaming-mode - | FileCheck %s --check-prefix=CHECK-FORCE-SCALABLE
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_compatible/ %s | opt -loop-vectorize -slp-vectorizer -S -enable-fixedwidth-autovec-in-streaming-mode - | FileCheck %s --check-prefix=CHECK-FORCE-FIXEDWIDTH
+
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_body/ %s | opt -loop-vectorize -slp-vectorizer -S - | FileCheck %s --check-prefix=CHECK
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_body/ %s | opt -loop-vectorize -slp-vectorizer -S -enable-scalable-autovec-in-streaming-mode - | FileCheck %s --check-prefix=CHECK-FORCE-SCALABLE
+; RUN: sed -e s/REPLACE_PSTATE_MACRO/aarch64_pstate_sm_body/ %s | opt -loop-vectorize -slp-vectorizer -S -enable-fixedwidth-autovec-in-streaming-mode - | FileCheck %s --check-prefix=CHECK-FORCE-FIXEDWIDTH
+
+target triple = "aarch64-unknown-linux-gnu"
+
+attributes #0 = { vscale_range(1,16) "target-features"="+neon,+sme,+sve2" "REPLACE_PSTATE_MACRO" }
+
+define void @test_fixedwidth_loopvec(ptr noalias %dst, ptr readonly %src, i32 %N) #0 {
+; CHECK-LABEL: @test_fixedwidth_loopvec
+; CHECK-NOT: <{{[1-9]+}} x i32>
+; CHECK-FORCE-FIXEDWIDTH-LABEL: @test_fixedwidth_loopvec
+; CHECK-FORCE-FIXEDWIDTH: <{{[1-9]+}} x i32>
+entry:
+ %cmp6 = icmp sgt i32 %N, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %add = add nsw i32 %0, 42
+ %arrayidx2 = getelementptr inbounds i32, ptr %dst, i64 %indvars.iv
+ store i32 %add, ptr %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+}
+
+!0 = distinct !{!0, !1, !2, !3}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.interleave.count", i32 1}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+
+define void @test_scalable_loopvec(ptr noalias %dst, ptr readonly %src, i32 %N) #0 {
+; CHECK-LABEL: @test_scalable_loopvec
+; CHECK-NOT: <vscale x {{[1-9]+}} x i32>
+; CHECK-FORCE-SCALABLE-LABEL: @test_scalable_loopvec
+; CHECK-FORCE-SCALABLE: <vscale x {{[1-9]+}} x i32>
+entry:
+ %cmp6 = icmp sgt i32 %N, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %add = add nsw i32 %0, 42
+ %arrayidx2 = getelementptr inbounds i32, ptr %dst, i64 %indvars.iv
+ store i32 %add, ptr %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !4
+}
+
+!4 = distinct !{!4, !5, !6, !7}
+!5 = !{!"llvm.loop.mustprogress"}
+!6 = !{!"llvm.loop.interleave.count", i32 1}
+!7 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+define void @test_slp(ptr noalias %dst, ptr readonly %src, i32 %N) #0 {
+; CHECK-LABEL: @test_slp
+; CHECK-NOT: <{{[1-9]+}} x i32>
+; CHECK-FORCE-FIXEDWIDTH-LABEL: @test_slp
+; CHECK-FORCE-FIXEDWIDTH: <{{[1-9]+}} x i32>
+entry:
+ %cmp6 = icmp sgt i32 %N, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %add = add nsw i32 %0, 42
+ %arrayidx2 = getelementptr inbounds i32, ptr %dst, i64 %indvars.iv
+ store i32 %add, ptr %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !8
+}
+
+!8 = distinct !{!8, !9, !10, !11}
+!9 = !{!"llvm.loop.mustprogress"}
+!10 = !{!"llvm.loop.interleave.count", i32 4}
+!11 = !{!"llvm.loop.vectorize.width", i32 1}
More information about the llvm-commits
mailing list