[libc-commits] [libcxx] [flang] [libc] [clang] [clang-tools-extra] [lld] [lldb] [llvm] [AArch64][MachinePipeliner] Add pipeliner support for AArch64 (PR #79589)
Yuta Mukai via libc-commits
libc-commits at lists.llvm.org
Thu Feb 1 16:19:54 PST 2024
https://github.com/ytmukai updated https://github.com/llvm/llvm-project/pull/79589
>From bcdb1e47ce841df96b2916d61cda018503f62358 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Tue, 12 Dec 2023 16:59:09 +0000
Subject: [PATCH] [AArch64][MachinePipeliner] Add pipeliner support for AArch64
Add AArch64 implementations for the interfaces of MachinePipeliner
pass. The pass is disabled by default for AArch64. It is enabled by
specifying --aarch64-enable-pipeliner.
Five tests in llvm-test-suite show a performance improvement of more than
5% on a Neoverse V1 processor.
| test | improvement |
| ---------------------------------------------------------------- | -----------:|
| MultiSource/Benchmarks/TSVC/Recurrences-dbl/Recurrences-dbl.test | 16% |
| MultiSource/Benchmarks/TSVC/Recurrences-flt/Recurrences-flt.test | 16% |
| SingleSource/Benchmarks/Adobe-C++/loop_unroll.test | 14% |
| SingleSource/Benchmarks/Misc/flops-5.test | 13% |
| SingleSource/Benchmarks/BenchmarkGame/spectral-norm.test | 6% |
(base flags: -mcpu=neoverse-v1 -O3 -mrecip, flags for pipelining:
-mllvm -aarch64-enable-pipeliner -mllvm
-pipeliner-max-stages=100 -mllvm -pipeliner-max-mii=100 -mllvm
-pipeliner-enable-copytophi=0)
On the other hand, there are cases of significant performance
degradation. Algorithm improvements and an option/pragma to control
pipelining will be needed in the future.
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 105 ++++++++++++++++++
llvm/lib/Target/AArch64/AArch64InstrInfo.h | 4 +
llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 4 +
llvm/lib/Target/AArch64/AArch64Subtarget.h | 3 +
.../Target/AArch64/AArch64TargetMachine.cpp | 7 ++
.../CodeGen/AArch64/sms-acceptable-loop1.mir | 78 +++++++++++++
.../CodeGen/AArch64/sms-acceptable-loop2.mir | 78 +++++++++++++
.../CodeGen/AArch64/sms-acceptable-loop3.mir | 79 +++++++++++++
.../CodeGen/AArch64/sms-acceptable-loop4.mir | 79 +++++++++++++
.../AArch64/sms-unacceptable-loop1.mir | 77 +++++++++++++
.../AArch64/sms-unacceptable-loop2.mir | 80 +++++++++++++
.../CodeGen/AArch64/sms-unpipeline-insts1.mir | 87 +++++++++++++++
.../CodeGen/AArch64/sms-unpipeline-insts2.mir | 80 +++++++++++++
13 files changed, 761 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sms-acceptable-loop1.mir
create mode 100644 llvm/test/CodeGen/AArch64/sms-acceptable-loop2.mir
create mode 100644 llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir
create mode 100644 llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir
create mode 100644 llvm/test/CodeGen/AArch64/sms-unacceptable-loop1.mir
create mode 100644 llvm/test/CodeGen/AArch64/sms-unacceptable-loop2.mir
create mode 100644 llvm/test/CodeGen/AArch64/sms-unpipeline-insts1.mir
create mode 100644 llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 8e50c16ba0887..809c3415ea234 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -9608,6 +9608,111 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
return ExitMBB->begin();
}
+namespace {
+/// AArch64 loop information handed to MachinePipeliner. It remembers the
+/// instruction feeding the loop's conditional branch and the branch condition
+/// supplied by its creator.
+class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
+  /// Instruction producing the value the loop's conditional branch depends on.
+  MachineInstr *CondProducer;
+  /// Branch condition, already normalized by whoever builds this object.
+  SmallVector<MachineOperand, 4> ExitCond;
+
+public:
+  AArch64PipelinerLoopInfo(MachineInstr *PredBranch,
+                           const SmallVectorImpl<MachineOperand> &Cond)
+      : CondProducer(PredBranch), ExitCond(Cond.begin(), Cond.end()) {}
+
+  /// Only the loop-control instruction itself is reported here, so that it is
+  /// placed in stage 0; its predecessors are taken care of by the caller.
+  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
+    return CondProducer == MI;
+  }
+
+  /// Hands back the stored condition through \p CondParam and returns an empty
+  /// optional. \p TC and \p MBB are not used.
+  std::optional<bool> createTripCountGreaterCondition(
+      int TC, MachineBasicBlock &MBB,
+      SmallVectorImpl<MachineOperand> &CondParam) override {
+    // The pipeliner will insert a branch of the form
+    // "if (Cond) goto epilogue"; the stored condition is normalized for that
+    // use. The predecessors of the branch are assumed to be inserted already.
+    CondParam = ExitCond;
+    return std::nullopt;
+  }
+
+  // The remaining hooks are intentionally empty for AArch64.
+  void setPreheader(MachineBasicBlock *NewPreheader) override {}
+
+  void adjustTripCount(int TripCountAdjust) override {}
+
+  void disposed() override {}
+};
+} // namespace
+
+/// Returns true if \p Opcode is one of the CBZ/CBNZ/TBZ/TBNZ branch opcodes
+/// (W or X form) listed below, and false for every other opcode.
+static bool isCompareAndBranch(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    return false;
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+  case AArch64::TBZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZW:
+  case AArch64::TBNZX:
+    return true;
+  }
+}
+
+/// Analyze the single-basic-block loop \p LoopBB for MachinePipeliner.
+///
+/// \returns loop information recording the instruction that produces the
+/// branch condition and the (normalized) branch condition itself, or nullptr
+/// if the loop is rejected. A loop is rejected when analyzeBranch() fails, when
+/// both branch targets are the loop itself, when analyzeBranch() reports no
+/// false-branch block, when the condition cannot be reversed, or when the
+/// instruction producing the branch condition cannot be found in \p LoopBB.
+std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  SmallVector<MachineOperand, 4> Cond;
+  if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
+    return nullptr;
+
+  // Infinite loops are not supported
+  if (TBB == LoopBB && FBB == LoopBB)
+    return nullptr;
+
+  // Must be conditional branch
+  if (FBB == nullptr)
+    return nullptr;
+
+  assert((TBB == LoopBB || FBB == LoopBB) &&
+         "The Loop must be a single-basic-block loop");
+
+  // Normalization for createTripCountGreaterCondition(): the stored condition
+  // must be the one under which the loop is exited. reverseBranchCondition()
+  // returns true when it cannot reverse the condition; give up in that case
+  // rather than keep a condition with the wrong polarity.
+  if (TBB == LoopBB && reverseBranchCondition(Cond))
+    return nullptr;
+
+  MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
+  const TargetRegisterInfo &TRI = getRegisterInfo();
+
+  // Find the instruction that produces the condition consumed by the
+  // conditional branch.
+  MachineInstr *PredBranch = nullptr;
+  if (CondBranch->getOpcode() == AArch64::Bcc) {
+    // Bcc reads NZCV: take the last instruction in the block that modifies it.
+    for (MachineInstr &MI : reverse(*LoopBB)) {
+      if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
+        PredBranch = &MI;
+        break;
+      }
+    }
+    if (!PredBranch)
+      return nullptr;
+  } else if (isCompareAndBranch(CondBranch->getOpcode())) {
+    const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
+    Register Reg = CondBranch->getOperand(0).getReg();
+    if (!Reg.isVirtual())
+      return nullptr;
+
+    // getVRegDef() returns null when the register has no definition (for
+    // example an undef operand); reject instead of dereferencing it.
+    PredBranch = MRI.getVRegDef(Reg);
+    if (!PredBranch)
+      return nullptr;
+
+    // MachinePipeliner does not expect that the immediate predecessor is a Phi
+    if (PredBranch->isPHI())
+      return nullptr;
+
+    if (PredBranch->getParent() != LoopBB)
+      return nullptr;
+  } else {
+    return nullptr;
+  }
+
+  return std::make_unique<AArch64PipelinerLoopInfo>(PredBranch, Cond);
+}
+
#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 6526f6740747a..f3a5db4367509 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -247,6 +247,10 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
+
+ std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+ analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;
+
bool
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index e3a0606331db1..6550c12722166 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -540,3 +540,7 @@ AArch64Subtarget::getAuthenticatedLRCheckMethod() const {
// performance regression or incompatibility with execute-only mappings.
return AArch64PAuth::AuthCheckMethod::None;
}
+
+/// The machine pipeliner is enabled exactly when the subtarget's scheduling
+/// model reports an instruction-level scheduling model.
+bool AArch64Subtarget::enableMachinePipeliner() const {
+  const auto &SchedModel = getSchedModel();
+  return SchedModel.hasInstrSchedModel();
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 16864102df59b..0292c018f1dbc 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -201,6 +201,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
bool enableMachineScheduler() const override { return true; }
bool enablePostRAScheduler() const override { return usePostRAScheduler(); }
+ bool enableMachinePipeliner() const override;
+ bool useDFAforSMS() const override { return false; }
+
/// Returns ARM processor family.
/// Avoid this function! CPU specifics should be kept local to this class
/// and preferably modeled with SubtargetFeatures or properties in
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 6fbc13d8904f2..81bb6e59422fa 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -204,6 +204,11 @@ static cl::opt<bool>
cl::desc("Enable sinking and folding of instruction copies"),
cl::init(true), cl::Hidden);
+// Hidden option, off by default; pass -aarch64-enable-pipeliner to turn it on.
+static cl::opt<bool> EnableMachinePipeliner(
+    "aarch64-enable-pipeliner", cl::Hidden, cl::init(false),
+    cl::desc("Enable Machine Pipeliner for AArch64"));
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
@@ -779,6 +784,8 @@ void AArch64PassConfig::addPreRegAlloc() {
// be register coalescer friendly.
addPass(&PeepholeOptimizerID);
}
+ if (TM->getOptLevel() != CodeGenOptLevel::None && EnableMachinePipeliner)
+ addPass(&MachinePipelinerID);
}
void AArch64PassConfig::addPostRegAlloc() {
diff --git a/llvm/test/CodeGen/AArch64/sms-acceptable-loop1.mir b/llvm/test/CodeGen/AArch64/sms-acceptable-loop1.mir
new file mode 100644
index 0000000000000..ed2bd73a7861a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-acceptable-loop1.mir
@@ -0,0 +1,78 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner 2>&1 | FileCheck %s
+
+# An acceptable loop by pipeliner: TBB == ExitBB, FBB == LoopBB, Branch with NZCV flags
+# CHECK: Schedule Found? 1
+
+--- |
+ define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
+ entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+ for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext nneg i32 %n to i64
+ br label %for.body
+
+ for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+ for.body: ; preds = %for.body.preheader, %for.body
+ %lsr.iv11 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+ %lsr.iv9 = phi ptr [ %b, %for.body.preheader ], [ %scevgep10, %for.body ]
+ %lsr.iv = phi ptr [ %a, %for.body.preheader ], [ %scevgep, %for.body ]
+ %0 = load float, ptr %lsr.iv9, align 4
+ %add = fadd float %0, 1.000000e+00
+ store float %add, ptr %lsr.iv, align 4
+ %scevgep = getelementptr i8, ptr %lsr.iv, i64 4
+ %scevgep10 = getelementptr i8, ptr %lsr.iv9, i64 4
+ %lsr.iv.next = add nsw i64 %lsr.iv11, -1
+ %exitcond.not = icmp eq i64 %lsr.iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ }
+
+...
+---
+name: func
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '%7' }
+ - { reg: '$x1', virtual-reg: '%8' }
+ - { reg: '$w2', virtual-reg: '%9' }
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x50000000), %bb.2(0x30000000)
+ liveins: $x0, $x1, $w2
+
+ %9:gpr32common = COPY $w2
+ %8:gpr64 = COPY $x1
+ %7:gpr64 = COPY $x0
+ dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv
+ Bcc 11, %bb.2, implicit $nzcv
+ B %bb.1
+
+ bb.1.for.body.preheader:
+ %11:gpr32 = ORRWrs $wzr, %9, 0
+ %0:gpr64all = SUBREG_TO_REG 0, killed %11, %subreg.sub_32
+ %14:fpr32 = FMOVSi 112
+ B %bb.3
+
+ bb.2.for.cond.cleanup:
+ RET_ReallyLR
+
+ bb.3.for.body:
+ successors: %bb.2(0x04000000), %bb.3(0x7c000000)
+
+ %1:gpr64sp = PHI %0, %bb.1, %6, %bb.3
+ %2:gpr64sp = PHI %8, %bb.1, %5, %bb.3
+ %3:gpr64sp = PHI %7, %bb.1, %4, %bb.3
+ early-clobber %12:gpr64sp, %13:fpr32 = LDRSpost %2, 4 :: (load (s32) from %ir.lsr.iv9)
+ %15:fpr32 = nofpexcept FADDSrr killed %13, %14, implicit $fpcr
+ early-clobber %16:gpr64sp = STRSpost killed %15, %3, 4 :: (store (s32) into %ir.lsr.iv)
+ %4:gpr64all = COPY %16
+ %5:gpr64all = COPY %12
+ %17:gpr64 = nsw SUBSXri %1, 1, 0, implicit-def $nzcv
+ %6:gpr64all = COPY %17
+ Bcc 0, %bb.2, implicit $nzcv
+ B %bb.3
+
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-acceptable-loop2.mir b/llvm/test/CodeGen/AArch64/sms-acceptable-loop2.mir
new file mode 100644
index 0000000000000..5cf6367354ecc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-acceptable-loop2.mir
@@ -0,0 +1,78 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner 2>&1 | FileCheck %s
+
+# An acceptable loop by pipeliner: TBB == LoopBB, FBB == ExitBB, Branch with NZCV flags
+# CHECK: Schedule Found? 1
+
+--- |
+ define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
+ entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+ for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext nneg i32 %n to i64
+ br label %for.body
+
+ for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+ for.body: ; preds = %for.body.preheader, %for.body
+ %lsr.iv11 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+ %lsr.iv9 = phi ptr [ %b, %for.body.preheader ], [ %scevgep10, %for.body ]
+ %lsr.iv = phi ptr [ %a, %for.body.preheader ], [ %scevgep, %for.body ]
+ %0 = load float, ptr %lsr.iv9, align 4
+ %add = fadd float %0, 1.000000e+00
+ store float %add, ptr %lsr.iv, align 4
+ %scevgep = getelementptr i8, ptr %lsr.iv, i64 4
+ %scevgep10 = getelementptr i8, ptr %lsr.iv9, i64 4
+ %lsr.iv.next = add nsw i64 %lsr.iv11, -1
+ %exitcond.not = icmp eq i64 %lsr.iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ }
+
+...
+---
+name: func
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '%7' }
+ - { reg: '$x1', virtual-reg: '%8' }
+ - { reg: '$w2', virtual-reg: '%9' }
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x50000000), %bb.2(0x30000000)
+ liveins: $x0, $x1, $w2
+
+ %9:gpr32common = COPY $w2
+ %8:gpr64 = COPY $x1
+ %7:gpr64 = COPY $x0
+ dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv
+ Bcc 11, %bb.2, implicit $nzcv
+ B %bb.1
+
+ bb.1.for.body.preheader:
+ %11:gpr32 = ORRWrs $wzr, %9, 0
+ %0:gpr64all = SUBREG_TO_REG 0, killed %11, %subreg.sub_32
+ %14:fpr32 = FMOVSi 112
+ B %bb.3
+
+ bb.2.for.cond.cleanup:
+ RET_ReallyLR
+
+ bb.3.for.body:
+ successors: %bb.2(0x04000000), %bb.3(0x7c000000)
+
+ %1:gpr64sp = PHI %0, %bb.1, %6, %bb.3
+ %2:gpr64sp = PHI %8, %bb.1, %5, %bb.3
+ %3:gpr64sp = PHI %7, %bb.1, %4, %bb.3
+ early-clobber %12:gpr64sp, %13:fpr32 = LDRSpost %2, 4 :: (load (s32) from %ir.lsr.iv9)
+ %15:fpr32 = nofpexcept FADDSrr killed %13, %14, implicit $fpcr
+ early-clobber %16:gpr64sp = STRSpost killed %15, %3, 4 :: (store (s32) into %ir.lsr.iv)
+ %4:gpr64all = COPY %16
+ %5:gpr64all = COPY %12
+ %17:gpr64 = nsw SUBSXri %1, 1, 0, implicit-def $nzcv
+ %6:gpr64all = COPY %17
+ Bcc 1, %bb.3, implicit $nzcv
+ B %bb.2
+
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir b/llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir
new file mode 100644
index 0000000000000..652770e3fcfa8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir
@@ -0,0 +1,79 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s
+
+# An acceptable loop by pipeliner: TBB == ExitBB, FBB == LoopBB, Compare and branch
+# CHECK: Schedule Found? 1
+
+--- |
+ define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
+ entry:
+ %or.cond = icmp ult i32 %n, 2
+ br i1 %or.cond, label %for.end, label %for.body.preheader
+
+ for.body.preheader: ; preds = %entry
+ %i.07 = add i32 %n, -1
+ %0 = sext i32 %i.07 to i64
+ br label %for.body
+
+ for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %1 = shl nsw i64 %indvars.iv, 2
+ %scevgep = getelementptr i8, ptr %b, i64 %1
+ %2 = load float, ptr %scevgep, align 4
+ %add = fadd float %2, 1.000000e+00
+ %3 = shl nsw i64 %indvars.iv, 2
+ %scevgep11 = getelementptr i8, ptr %a, i64 %3
+ store float %add, ptr %scevgep11, align 4
+ %indvars.iv.next = add nsw i64 %indvars.iv, -1
+ %4 = add i64 %indvars.iv, -1
+ %5 = and i64 %4, 4294967295
+ %tobool.not = icmp eq i64 %5, 0
+ br i1 %tobool.not, label %for.end, label %for.body
+
+ for.end: ; preds = %for.body, %entry
+ ret void
+ }
+
+...
+---
+name: func
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '%3' }
+ - { reg: '$x1', virtual-reg: '%4' }
+ - { reg: '$w2', virtual-reg: '%5' }
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $w2
+
+ %5:gpr32common = COPY $w2
+ %4:gpr64common = COPY $x1
+ %3:gpr64common = COPY $x0
+ dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv
+ Bcc 3, %bb.3, implicit $nzcv
+ B %bb.1
+
+ bb.1.for.body.preheader:
+ %7:gpr32common = SUBWri %5, 1, 0
+ %9:gpr64all = IMPLICIT_DEF
+ %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32
+ %10:gpr64 = SBFMXri killed %8, 0, 31
+ %0:gpr64all = COPY %10
+ %12:fpr32 = FMOVSi 112
+
+ bb.2.for.body:
+ successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+ %1:gpr64common = PHI %0, %bb.1, %2, %bb.2
+ %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep)
+ %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr
+ STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11)
+ %14:gpr64common = SUBXri %1, 1, 0
+ %2:gpr64all = COPY %14
+ %15:gpr32 = COPY %14.sub_32
+ CBZW killed %15, %bb.3
+ B %bb.2
+
+ bb.3.for.end:
+ RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir b/llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir
new file mode 100644
index 0000000000000..95d64cae5b780
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir
@@ -0,0 +1,79 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s
+
+# An acceptable loop by pipeliner: TBB == LoopBB, FBB == ExitBB, Compare and branch
+# CHECK: Schedule Found? 1
+
+--- |
+ define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
+ entry:
+ %or.cond = icmp ult i32 %n, 2
+ br i1 %or.cond, label %for.end, label %for.body.preheader
+
+ for.body.preheader: ; preds = %entry
+ %i.07 = add i32 %n, -1
+ %0 = sext i32 %i.07 to i64
+ br label %for.body
+
+ for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %1 = shl nsw i64 %indvars.iv, 2
+ %scevgep = getelementptr i8, ptr %b, i64 %1
+ %2 = load float, ptr %scevgep, align 4
+ %add = fadd float %2, 1.000000e+00
+ %3 = shl nsw i64 %indvars.iv, 2
+ %scevgep11 = getelementptr i8, ptr %a, i64 %3
+ store float %add, ptr %scevgep11, align 4
+ %indvars.iv.next = add nsw i64 %indvars.iv, -1
+ %4 = add i64 %indvars.iv, -1
+ %5 = and i64 %4, 4294967295
+ %tobool.not = icmp eq i64 %5, 0
+ br i1 %tobool.not, label %for.end, label %for.body
+
+ for.end: ; preds = %for.body, %entry
+ ret void
+ }
+
+...
+---
+name: func
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '%3' }
+ - { reg: '$x1', virtual-reg: '%4' }
+ - { reg: '$w2', virtual-reg: '%5' }
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $w2
+
+ %5:gpr32common = COPY $w2
+ %4:gpr64common = COPY $x1
+ %3:gpr64common = COPY $x0
+ dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv
+ Bcc 3, %bb.3, implicit $nzcv
+ B %bb.1
+
+ bb.1.for.body.preheader:
+ %7:gpr32common = SUBWri %5, 1, 0
+ %9:gpr64all = IMPLICIT_DEF
+ %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32
+ %10:gpr64 = SBFMXri killed %8, 0, 31
+ %0:gpr64all = COPY %10
+ %12:fpr32 = FMOVSi 112
+
+ bb.2.for.body:
+ successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+ %1:gpr64common = PHI %0, %bb.1, %2, %bb.2
+ %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep)
+ %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr
+ STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11)
+ %14:gpr64common = SUBXri %1, 1, 0
+ %2:gpr64all = COPY %14
+ %15:gpr32 = COPY %14.sub_32
+ CBNZW killed %15, %bb.2
+ B %bb.3
+
+ bb.3.for.end:
+ RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-unacceptable-loop1.mir b/llvm/test/CodeGen/AArch64/sms-unacceptable-loop1.mir
new file mode 100644
index 0000000000000..79dc1482c748f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-unacceptable-loop1.mir
@@ -0,0 +1,77 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner 2>&1 | FileCheck %s
+
+# An unacceptable loop by pipeliner: No exits
+# CHECK: Unable to analyzeLoop, can NOT pipeline Loop
+
+--- |
+ define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
+ entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+ for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext nneg i32 %n to i64
+ br label %for.body
+
+ for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+ for.body: ; preds = %for.body.preheader, %for.body
+ %lsr.iv11 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+ %lsr.iv9 = phi ptr [ %b, %for.body.preheader ], [ %scevgep10, %for.body ]
+ %lsr.iv = phi ptr [ %a, %for.body.preheader ], [ %scevgep, %for.body ]
+ %0 = load float, ptr %lsr.iv9, align 4
+ %add = fadd float %0, 1.000000e+00
+ store float %add, ptr %lsr.iv, align 4
+ %scevgep = getelementptr i8, ptr %lsr.iv, i64 4
+ %scevgep10 = getelementptr i8, ptr %lsr.iv9, i64 4
+ %lsr.iv.next = add nsw i64 %lsr.iv11, -1
+ %exitcond.not = icmp eq i64 %lsr.iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ }
+
+...
+---
+name: func
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '%7' }
+ - { reg: '$x1', virtual-reg: '%8' }
+ - { reg: '$w2', virtual-reg: '%9' }
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x50000000), %bb.2(0x30000000)
+ liveins: $x0, $x1, $w2
+
+ %9:gpr32common = COPY $w2
+ %8:gpr64 = COPY $x1
+ %7:gpr64 = COPY $x0
+ dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv
+ Bcc 11, %bb.2, implicit $nzcv
+ B %bb.1
+
+ bb.1.for.body.preheader:
+ %11:gpr32 = ORRWrs $wzr, %9, 0
+ %0:gpr64all = SUBREG_TO_REG 0, killed %11, %subreg.sub_32
+ %14:fpr32 = FMOVSi 112
+ B %bb.3
+
+ bb.2.for.cond.cleanup:
+ RET_ReallyLR
+
+ bb.3.for.body:
+ successors: %bb.3(0x7c000000)
+
+ %1:gpr64sp = PHI %0, %bb.1, %6, %bb.3
+ %2:gpr64sp = PHI %8, %bb.1, %5, %bb.3
+ %3:gpr64sp = PHI %7, %bb.1, %4, %bb.3
+ early-clobber %12:gpr64sp, %13:fpr32 = LDRSpost %2, 4 :: (load (s32) from %ir.lsr.iv9)
+ %15:fpr32 = nofpexcept FADDSrr killed %13, %14, implicit $fpcr
+ early-clobber %16:gpr64sp = STRSpost killed %15, %3, 4 :: (store (s32) into %ir.lsr.iv)
+ %4:gpr64all = COPY %16
+ %5:gpr64all = COPY %12
+ %17:gpr64 = nsw SUBSXri %1, 1, 0, implicit-def $nzcv
+ %6:gpr64all = COPY %17
+ B %bb.3
+
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-unacceptable-loop2.mir b/llvm/test/CodeGen/AArch64/sms-unacceptable-loop2.mir
new file mode 100644
index 0000000000000..c3807ae272c6c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-unacceptable-loop2.mir
@@ -0,0 +1,80 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s
+
+# An unacceptable loop by pipeliner: The operand of the compare and branch is not defined in the loop
+# CHECK: Unable to analyzeLoop, can NOT pipeline Loop
+
+--- |
+ define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
+ entry:
+ %or.cond = icmp ult i32 %n, 2
+ br i1 %or.cond, label %for.end, label %for.body.preheader
+
+ for.body.preheader: ; preds = %entry
+ %i.07 = add i32 %n, -1
+ %0 = sext i32 %i.07 to i64
+ br label %for.body
+
+ for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %1 = shl nsw i64 %indvars.iv, 2
+ %scevgep = getelementptr i8, ptr %b, i64 %1
+ %2 = load float, ptr %scevgep, align 4
+ %add = fadd float %2, 1.000000e+00
+ %3 = shl nsw i64 %indvars.iv, 2
+ %scevgep11 = getelementptr i8, ptr %a, i64 %3
+ store float %add, ptr %scevgep11, align 4
+ %indvars.iv.next = add nsw i64 %indvars.iv, -1
+ %4 = add i64 %indvars.iv, -1
+ %5 = and i64 %4, 4294967295
+ %tobool.not = icmp eq i64 %5, 0
+ br i1 %tobool.not, label %for.end, label %for.body
+
+ for.end: ; preds = %for.body, %entry
+ ret void
+ }
+
+...
+---
+name: func
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '%3' }
+ - { reg: '$x1', virtual-reg: '%4' }
+ - { reg: '$w2', virtual-reg: '%5' }
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $w2
+
+ %5:gpr32common = COPY $w2
+ %4:gpr64common = COPY $x1
+ %3:gpr64common = COPY $x0
+ dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv
+ Bcc 3, %bb.3, implicit $nzcv
+ B %bb.1
+
+ bb.1.for.body.preheader:
+ %7:gpr32common = SUBWri %5, 1, 0
+ %9:gpr64all = IMPLICIT_DEF
+ %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32
+ %10:gpr64 = SBFMXri killed %8, 0, 31
+ %0:gpr64all = COPY %10
+ %12:fpr32 = FMOVSi 112
+ %16:gpr32 = COPY %10.sub_32
+
+ bb.2.for.body:
+ successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+ %1:gpr64common = PHI %0, %bb.1, %2, %bb.2
+ %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep)
+ %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr
+ STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11)
+ %14:gpr64common = SUBXri %1, 1, 0
+ %2:gpr64all = COPY %14
+ %15:gpr32 = COPY %14.sub_32
+ CBZW %16, %bb.3
+ B %bb.2
+
+ bb.3.for.end:
+ RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-unpipeline-insts1.mir b/llvm/test/CodeGen/AArch64/sms-unpipeline-insts1.mir
new file mode 100644
index 0000000000000..5973a44308253
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-unpipeline-insts1.mir
@@ -0,0 +1,87 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -mcpu=neoverse-n1 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner 2>&1 | FileCheck %s
+
+# Check that instructions referencing NZCV are not pipelined
+
+# CHECK: SU([[SU0:[0-9]+]]): nofpexcept FCMPSri {{.*}}, implicit-def $nzcv, implicit $fpcr
+# CHECK: SU([[SU1:[0-9]+]]): {{.*}} = FCSELSrrr {{.*}}, {{.*}}, 1, implicit $nzcv
+# CHECK-DAG: Do not pipeline SU([[SU0]])
+# CHECK-DAG: Do not pipeline SU([[SU1]])
+
+--- |
+ define dso_local void @KERNEL(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
+ entry:
+ %cmp19 = icmp sgt i32 %n, 0
+ br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+ for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext nneg i32 %n to i64
+ br label %for.body
+
+ for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+ for.body: ; preds = %for.body.preheader, %for.body
+ %lsr.iv24 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+ %lsr.iv22 = phi ptr [ %b, %for.body.preheader ], [ %scevgep23, %for.body ]
+ %lsr.iv = phi ptr [ %a, %for.body.preheader ], [ %scevgep, %for.body ]
+ %0 = load float, ptr %lsr.iv22, align 4
+ %tobool = fcmp une float %0, 0.000000e+00
+ %. = select i1 %tobool, float 1.000000e+00, float 2.000000e+00
+ %add = fadd float %0, %.
+ store float %add, ptr %lsr.iv, align 4
+ %scevgep = getelementptr i8, ptr %lsr.iv, i64 4
+ %scevgep23 = getelementptr i8, ptr %lsr.iv22, i64 4
+ %lsr.iv.next = add nsw i64 %lsr.iv24, -1
+ %exitcond.not = icmp eq i64 %lsr.iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ }
+
+...
+---
+name: KERNEL
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '%7' }
+ - { reg: '$x1', virtual-reg: '%8' }
+ - { reg: '$w2', virtual-reg: '%9' }
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x50000000), %bb.2(0x30000000)
+ liveins: $x0, $x1, $w2
+
+ %9:gpr32common = COPY $w2
+ %8:gpr64 = COPY $x1
+ %7:gpr64 = COPY $x0
+ dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv
+ Bcc 11, %bb.2, implicit $nzcv
+ B %bb.1
+
+ bb.1.for.body.preheader:
+ %11:gpr32 = ORRWrs $wzr, %9, 0
+ %0:gpr64all = SUBREG_TO_REG 0, killed %11, %subreg.sub_32
+ %14:fpr32 = FMOVSi 0
+ %15:fpr32 = FMOVSi 112
+ B %bb.3
+
+ bb.2.for.cond.cleanup:
+ RET_ReallyLR
+
+ bb.3.for.body:
+ successors: %bb.2(0x04000000), %bb.3(0x7c000000)
+
+ %1:gpr64sp = PHI %0, %bb.1, %6, %bb.3
+ %2:gpr64sp = PHI %8, %bb.1, %5, %bb.3
+ %3:gpr64sp = PHI %7, %bb.1, %4, %bb.3
+ early-clobber %12:gpr64sp, %13:fpr32 = LDRSpost %2, 4 :: (load (s32) from %ir.lsr.iv22)
+ nofpexcept FCMPSri %13, implicit-def $nzcv, implicit $fpcr
+ %16:fpr32 = FCSELSrrr %15, %14, 1, implicit $nzcv
+ %17:fpr32 = nofpexcept FADDSrr %13, killed %16, implicit $fpcr
+ early-clobber %18:gpr64sp = STRSpost killed %17, %3, 4 :: (store (s32) into %ir.lsr.iv)
+ %4:gpr64all = COPY %18
+ %5:gpr64all = COPY %12
+ %19:gpr64 = nsw SUBSXri %1, 1, 0, implicit-def $nzcv
+ %6:gpr64all = COPY %19
+ Bcc 0, %bb.2, implicit $nzcv
+ B %bb.3
+
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir b/llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir
new file mode 100644
index 0000000000000..fdecbffdd4490
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir
@@ -0,0 +1,80 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s
+
+# Check that the instruction computing the operand of the compare and branch is not pipelined
+# CHECK: SU([[SU0:[0-9]+]]): [[V0:%[0-9]+]]:gpr64common = SUBXri [[V1:%[0-9]+]]:gpr64common, 1, 0
+# CHECK: Do not pipeline SU([[SU0]])
+
+--- |
+ define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 {
+ entry:
+ %or.cond = icmp ult i32 %n, 2
+ br i1 %or.cond, label %for.end, label %for.body.preheader
+
+ for.body.preheader: ; preds = %entry
+ %i.07 = add i32 %n, -1
+ %0 = sext i32 %i.07 to i64
+ br label %for.body
+
+ for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %1 = shl nsw i64 %indvars.iv, 2
+ %scevgep = getelementptr i8, ptr %b, i64 %1
+ %2 = load float, ptr %scevgep, align 4
+ %add = fadd float %2, 1.000000e+00
+ %3 = shl nsw i64 %indvars.iv, 2
+ %scevgep11 = getelementptr i8, ptr %a, i64 %3
+ store float %add, ptr %scevgep11, align 4
+ %indvars.iv.next = add nsw i64 %indvars.iv, -1
+ %4 = add i64 %indvars.iv, -1
+ %5 = and i64 %4, 4294967295
+ %tobool.not = icmp eq i64 %5, 0
+ br i1 %tobool.not, label %for.end, label %for.body
+
+ for.end: ; preds = %for.body, %entry
+ ret void
+ }
+
+...
+---
+name: func
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '%3' }
+ - { reg: '$x1', virtual-reg: '%4' }
+ - { reg: '$w2', virtual-reg: '%5' }
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $w2
+
+ %5:gpr32common = COPY $w2
+ %4:gpr64common = COPY $x1
+ %3:gpr64common = COPY $x0
+ dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv
+ Bcc 3, %bb.3, implicit $nzcv
+ B %bb.1
+
+ bb.1.for.body.preheader:
+ %7:gpr32common = SUBWri %5, 1, 0
+ %9:gpr64all = IMPLICIT_DEF
+ %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32
+ %10:gpr64 = SBFMXri killed %8, 0, 31
+ %0:gpr64all = COPY %10
+ %12:fpr32 = FMOVSi 112
+
+ bb.2.for.body:
+ successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+ %1:gpr64common = PHI %0, %bb.1, %2, %bb.2
+ %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep)
+ %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr
+ STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11)
+ %14:gpr64common = SUBXri %1, 1, 0
+ %2:gpr64all = COPY %14
+ %15:gpr32 = COPY %14.sub_32
+ CBNZW killed %15, %bb.2
+ B %bb.3
+
+ bb.3.for.end:
+ RET_ReallyLR
+
+...
More information about the libc-commits
mailing list