[llvm] [Hexagon] Add MachineUnroller pass (PR #177197)
Fateme Hosseini via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 21 09:24:16 PST 2026
https://github.com/fhossein-quic updated https://github.com/llvm/llvm-project/pull/177197
>From 6c79adca724be816d5e5061a9aadf4cd5369c03e Mon Sep 17 00:00:00 2001
From: jverma <jverma at qti.qualcomm.com>
Date: Thu, 15 Jan 2026 09:36:12 -0800
Subject: [PATCH] [Hexagon] Add MachineUnroller pass
This patch adds a framework for the
target-independent machine unroller pass which
works at the MI level. Only small innermost loops
with runtime trip counts and a single basic block
are handled. The loops are unrolled only if it is
determined to improve resource usage for the loop.
This pass is enabled for Hexagon only. To enable
it for other targets, a target-specific
MachineUnroller must be implemented that inherits
from the MachineUnroller class. The target also
needs to implement the createMachineUnroller
function which creates and returns a pointer to the
target's MachineUnroller object. Finally, the pass
needs to be called from the target's TargetMachine
implementation.
For Hexagon, this pass is called before the
software pipeliner at -O3 optimization level. Since
unrolling often improves ILP (sometimes with the
help of other optimizations), when combined with
software pipelining, it can help reduce the number
of cycles per iteration and thus improve
performance.
The pass provides heuristics based on
resource-constrained minimum initiation interval
(ResMII), detects and handles self-dependencies,
preserves debug information during unrolling, and
includes optimization remarks for analysis. It can
be controlled via the -enable-machine-unroller
command-line option.
Patch By:
Fateme Hosseini
Co-authored-by: Sumanth Gundapaneni <sgundapa at qti.qualcomm.com>
---
llvm/include/llvm/CodeGen/MachineUnroller.h | 125 +++
llvm/include/llvm/CodeGen/Passes.h | 3 +
llvm/include/llvm/CodeGen/TargetPassConfig.h | 7 +
llvm/include/llvm/InitializePasses.h | 1 +
llvm/lib/CodeGen/CMakeLists.txt | 2 +
llvm/lib/CodeGen/CodeGen.cpp | 1 +
llvm/lib/CodeGen/MachineUnroller.cpp | 721 ++++++++++++++++++
llvm/lib/CodeGen/MachineUnrollerPass.cpp | 596 +++++++++++++++
llvm/lib/Target/Hexagon/CMakeLists.txt | 1 +
.../Target/Hexagon/HexagonMachineUnroller.cpp | 148 ++++
.../Target/Hexagon/HexagonMachineUnroller.h | 46 ++
.../Target/Hexagon/HexagonTargetMachine.cpp | 8 +
.../CodeGen/Hexagon/dbg-instr-machunroll.ll | 141 ++++
.../CodeGen/Hexagon/dbg-label-machunroll.ll | 61 ++
.../Hexagon/machine-unroller-remarks.ll | 48 ++
.../CodeGen/Hexagon/miunroll-adjust-resmii.ll | 48 ++
.../Hexagon/miunroll-memoperand-size.mir | 37 +
.../Hexagon/miunroll-optimize-memrefs1.ll | 123 +++
.../Hexagon/miunroll-selfdependency.ll | 25 +
.../Hexagon/miunroll-selfdependency2.ll | 54 ++
.../Hexagon/miunroll-update-memoperands.ll | 56 ++
llvm/test/CodeGen/Hexagon/miunroll-valign.ll | 82 ++
llvm/test/CodeGen/Hexagon/miunroll.ll | 52 ++
23 files changed, 2386 insertions(+)
create mode 100644 llvm/include/llvm/CodeGen/MachineUnroller.h
create mode 100644 llvm/lib/CodeGen/MachineUnroller.cpp
create mode 100644 llvm/lib/CodeGen/MachineUnrollerPass.cpp
create mode 100644 llvm/lib/Target/Hexagon/HexagonMachineUnroller.cpp
create mode 100644 llvm/lib/Target/Hexagon/HexagonMachineUnroller.h
create mode 100644 llvm/test/CodeGen/Hexagon/dbg-instr-machunroll.ll
create mode 100644 llvm/test/CodeGen/Hexagon/dbg-label-machunroll.ll
create mode 100644 llvm/test/CodeGen/Hexagon/machine-unroller-remarks.ll
create mode 100644 llvm/test/CodeGen/Hexagon/miunroll-adjust-resmii.ll
create mode 100644 llvm/test/CodeGen/Hexagon/miunroll-memoperand-size.mir
create mode 100644 llvm/test/CodeGen/Hexagon/miunroll-optimize-memrefs1.ll
create mode 100644 llvm/test/CodeGen/Hexagon/miunroll-selfdependency.ll
create mode 100644 llvm/test/CodeGen/Hexagon/miunroll-selfdependency2.ll
create mode 100644 llvm/test/CodeGen/Hexagon/miunroll-update-memoperands.ll
create mode 100644 llvm/test/CodeGen/Hexagon/miunroll-valign.ll
create mode 100644 llvm/test/CodeGen/Hexagon/miunroll.ll
diff --git a/llvm/include/llvm/CodeGen/MachineUnroller.h b/llvm/include/llvm/CodeGen/MachineUnroller.h
new file mode 100644
index 0000000000000..03d9ab34f678d
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/MachineUnroller.h
@@ -0,0 +1,125 @@
+//===- MachineUnroller.h - Machine loop unrolling utilities -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines loop unrolling utilities used at the machine instruction
+// (MI) level.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MACHINEUNROLLER_H
+#define LLVM_CODEGEN_MACHINEUNROLLER_H
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+namespace llvm {
+
+// This is a utility for unrolling loops at MI level.
+// It only unrolls loops that have a run-time trip count and
+// consist of a single basic block.
+//
+// After unrolling, the loop structure will be the following:
+//
+// Original LoopPreheader
+// Unrolled LoopPreheader
+// Unrolled Loop
+// Unrolled LoopExit
+// Remainder LoopPreheader
+// Remainder Loop
+// Remainder LoopExit
+// Original LoopExit
+
+/// Bundles the per-function objects handed to a MachineUnroller when it is
+/// constructed. Every pointer defaults to null.
+struct MachineUnrollerContext {
+  MachineFunction *MF = nullptr;
+  MachineLoopInfo *MLI = nullptr;
+  const TargetInstrInfo *TII = nullptr;
+
+  // The default member initializers above already do all the work.
+  MachineUnrollerContext() = default;
+  MachineUnrollerContext(MachineFunction *mf, MachineLoopInfo *mli,
+                         const TargetInstrInfo *tii)
+      : MF(mf), MLI(mli), TII(tii) {}
+};
+
+/// Target-independent part of the MI-level loop unroller. Targets derive from
+/// this class and implement the pure-virtual loop-count hooks declared below.
+class MachineUnroller {
+protected:
+  MachineFunction *MF = nullptr;
+  MachineLoopInfo *MLI = nullptr;
+  const TargetInstrInfo *TII = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+
+  // Per-loop state, filled in while a loop is being processed. Everything is
+  // given a default value so no member is indeterminate before then.
+  MachineLoop *L = nullptr;
+  MachineBasicBlock *OrigHeader = nullptr;
+  MachineBasicBlock *OrigPreheader = nullptr;
+  // UL* blocks belong to the unrolled loop, RL* blocks to the remainder loop.
+  MachineBasicBlock *ULPreheader = nullptr;
+  MachineBasicBlock *ULHeader = nullptr;
+  MachineBasicBlock *ULExit = nullptr;
+  MachineBasicBlock *RLPreheader = nullptr;
+  MachineBasicBlock *RLHeader = nullptr;
+  MachineBasicBlock *RLExit = nullptr;
+  MachineBasicBlock *OrigLoopExit = nullptr;
+  unsigned UnrollFactor = 0;
+  // Register holding the loop trip count, as returned by getLoopCount().
+  unsigned LC = 0;
+  SmallVector<MachineBasicBlock *, 4> LoopBBs;
+  SmallVector<unsigned, 4> ExitBBLiveIns;
+
+  // Per-basic-block map from a register to the register that replaces it.
+  using ValueMapTy =
+      SmallDenseMap<MachineBasicBlock *, DenseMap<unsigned, unsigned>, 4>;
+  ValueMapTy VRMap;
+  // Original PHI def register -> def register of its copy in the unrolled
+  // loop header.
+  DenseMap<unsigned, unsigned> ULPhiVRMap;
+  void createUnrolledLoopStruct();
+  void updateInstruction(MachineInstr *NewMI, bool FirstIter,
+                         ValueMapTy &OldVRMap);
+  void generateUnrolledLoop();
+  unsigned getMappedRegORCreate(unsigned Reg, MachineBasicBlock *BB);
+  void generateNewPhis(MachineBasicBlock *BB, MachineBasicBlock *BB1,
+                       MachineBasicBlock *BB2);
+  void generatePhisForRLExit();
+  void generatePhisForULExit();
+  void getExitBBLiveIns();
+  void addBBIntoVRMap(MachineBasicBlock *BB);
+  void fixBranchesAndLoopCount(unsigned ULCount, unsigned RLCount);
+  unsigned getLatestInstance(unsigned reg, MachineBasicBlock *BB,
+                             ValueMapTy &VRMap);
+  void init(MachineLoop *loop, unsigned unrollFactor);
+  bool canUnroll();
+  void preprocessPhiNodes(MachineBasicBlock &B);
+
+public:
+  /// \p C must be non-null and its MF must be set; both are dereferenced here.
+  MachineUnroller(MachineUnrollerContext *C)
+      : MF(C->MF), MLI(C->MLI), TII(C->TII) {
+    MRI = &MF->getRegInfo();
+  }
+
+  virtual ~MachineUnroller() = default;
+
+  bool unroll(MachineLoop *loop, unsigned unrollFactor);
+
+  /// Return the register holding the trip count of the loop in \p LoopBB.
+  /// canUnroll() only accepts the loop when this is a virtual register.
+  virtual unsigned getLoopCount(MachineBasicBlock &LoopBB) const = 0;
+
+  /// Add instruction to compute trip count for the unrolled loop.
+  virtual unsigned addUnrolledLoopCountMI(MachineBasicBlock &MBB, unsigned LC,
+                                          unsigned UnrollFactor) const = 0;
+
+  /// Add instruction to compute remainder trip count for the unrolled loop.
+  virtual unsigned addRemLoopCountMI(MachineBasicBlock &MBB, unsigned LC,
+                                     unsigned UnrollFactor) const = 0;
+
+  virtual void changeLoopCount(MachineBasicBlock &BB,
+                               MachineBasicBlock &Preheader,
+                               MachineBasicBlock &Header,
+                               MachineBasicBlock &LoopBB, unsigned LC,
+                               SmallVectorImpl<MachineOperand> &Cond) const = 0;
+
+  bool computeDelta(MachineInstr &MI, unsigned &Delta) const;
+  void updateMemOperands(MachineInstr *NewMI, MachineInstr *OldMI,
+                         unsigned iter) const;
+  /// Optional target hook; the default implementation does nothing.
+  virtual void optimize(MachineBasicBlock &BB) const {}
+};
+} // namespace llvm
+#endif
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 2717110e1b3e7..bf880d00da726 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -533,6 +533,9 @@ LLVM_ABI FunctionPass *createRegUsageInfoPropPass();
/// This pass performs software pipelining on machine instructions.
LLVM_ABI extern char &MachinePipelinerID;
+/// This pass performs loop unrolling at the machine instruction (MI) level.
+LLVM_ABI extern char &MachineUnrollerPassID;
+
/// This pass frees the memory occupied by the MachineFunction.
LLVM_ABI FunctionPass *createFreeMachineFunctionPass();
diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h
index 5e0e641a981f9..3aaa421a3bcee 100644
--- a/llvm/include/llvm/CodeGen/TargetPassConfig.h
+++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h
@@ -26,6 +26,8 @@ class TargetMachine;
class PassConfigImpl;
class CSEConfigBase;
class PassInstrumentationCallbacks;
+class MachineUnroller;
+struct MachineUnrollerContext;
// The old pass manager infrastructure is hidden in a legacy namespace now.
namespace legacy {
@@ -299,6 +301,11 @@ class LLVM_ABI TargetPassConfig : public ImmutablePass {
/// Fully developed targets will not generally override this.
virtual void addMachinePasses();
+ /// Hook for creating the target's MachineUnroller from the context \p C.
+ /// This default implementation returns nullptr.
+ virtual MachineUnroller *
+ createMachineUnroller(MachineUnrollerContext *C) const {
+ return nullptr;
+ }
+
/// printAndVerify - Add a pass to dump then verify the machine function, if
/// those steps are enabled.
void printAndVerify(const std::string &Banner);
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index e9e3ca3cc93a0..8cf4af720092a 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -212,6 +212,7 @@ LLVM_ABI void initializeMachineLateInstrsCleanupLegacyPass(PassRegistry &);
LLVM_ABI void initializeMachineLICMPass(PassRegistry &);
LLVM_ABI void initializeMachineLoopInfoWrapperPassPass(PassRegistry &);
LLVM_ABI void initializeMachineModuleInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineUnrollerPassPass(PassRegistry &);
LLVM_ABI void
initializeMachineOptimizationRemarkEmitterPassPass(PassRegistry &);
LLVM_ABI void initializeMachineOutlinerPass(PassRegistry &);
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index f26b2cb6fddf5..323447773b58d 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -143,6 +143,8 @@ add_llvm_component_library(LLVMCodeGen
MachineOutliner.cpp
MachinePassManager.cpp
MachinePipeliner.cpp
+ MachineUnroller.cpp
+ MachineUnrollerPass.cpp
MachinePostDominators.cpp
MachineRegionInfo.cpp
MachineRegisterInfo.cpp
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 3550eea13979a..627e413d5263d 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -86,6 +86,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeMachineLICMPass(Registry);
initializeMachineLoopInfoWrapperPassPass(Registry);
initializeMachineModuleInfoWrapperPassPass(Registry);
+ initializeMachineUnrollerPassPass(Registry);
initializeMachineOptimizationRemarkEmitterPassPass(Registry);
initializeMachineOutlinerPass(Registry);
initializeMachinePipelinerPass(Registry);
diff --git a/llvm/lib/CodeGen/MachineUnroller.cpp b/llvm/lib/CodeGen/MachineUnroller.cpp
new file mode 100644
index 0000000000000..8fb4a96f9ae52
--- /dev/null
+++ b/llvm/lib/CodeGen/MachineUnroller.cpp
@@ -0,0 +1,721 @@
+//===- MachineUnroller.cpp - Machine loop unrolling utilities -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements loop unrolling functionality at the machine instruction
+// (MI) level.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineUnroller.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-unroller"
+
+// This is a utility for unrolling loops at MI level.
+// It only unrolls loops that have a run-time trip count and
+// consist of a single basic block.
+//
+// After unrolling, the loop structure will be the following:
+//
+// Original LoopPreheader
+// Unrolled LoopPreheader
+// Unrolled Loop
+// Unrolled LoopExit
+// Remainder LoopPreheader
+// Remainder Loop
+// Remainder LoopExit
+// Original LoopExit
+
+/// Record the loop to transform and the requested unroll factor, cache the
+/// loop's header, preheader and exit block, and empty the block and live-in
+/// lists.
+/// NOTE(review): VRMap and ULPhiVRMap are not reset here -- confirm they are
+/// cleared elsewhere if one unroller object is reused for several loops.
+void MachineUnroller::init(MachineLoop *loop, unsigned unrollFactor) {
+  LoopBBs.clear();
+  ExitBBLiveIns.clear();
+
+  UnrollFactor = unrollFactor;
+  L = loop;
+  OrigLoopExit = L->getExitBlock();
+  OrigPreheader = L->getLoopPreheader();
+  OrigHeader = L->getHeader();
+}
+
+/// Return true if the loop recorded by init() can be unrolled: the unroll
+/// factor must be a power of two, TII->analyzeLoopForPipelining must succeed
+/// on the loop block, and the trip count returned by getLoopCount() must be a
+/// virtual register. On success the trip-count register is left in LC.
+bool MachineUnroller::canUnroll() {
+  // Only loops with a single basic block are handled. Also, the loop must
+  // be analyzable using analyzeBranch. It's the responsibility of the caller
+  // of this function to make sure that these requirements are met.
+  assert(L->getNumBlocks() == 1 &&
+         "Only loops with single basic block can be unrolled!!");
+  if (!isPowerOf2_32(UnrollFactor)) {
+    LLVM_DEBUG(dbgs() << "Can't Unroll!! UnrollFactor must be a power of 2.\n");
+    return false;
+  }
+
+  if (!TII->analyzeLoopForPipelining(L->getTopBlock()))
+    return false;
+
+  // Get loop trip count. Compile-time trip count is not handled.
+  LC = getLoopCount(*L->getTopBlock());
+  return Register::isVirtualRegister(LC);
+}
+
+/// Create empty basic blocks for the unrolled/remainder loops and
+/// add them to the CFG. Some of the BBs from the original loop are reused
+/// and their successors/predecessors are changed as needed.
+void MachineUnroller::createUnrolledLoopStruct() {
+ // Create basic blocks for the Unrolled Loop.
+ ULPreheader = MF->CreateMachineBasicBlock();
+ MF->insert(OrigHeader->getIterator(), ULPreheader);
+
+ ULHeader = MF->CreateMachineBasicBlock();
+ ULHeader->setAlignment(OrigHeader->getAlignment());
+ MF->insert(OrigHeader->getIterator(), ULHeader);
+
+ ULPreheader->addSuccessor(ULHeader);
+ // Back edge of the unrolled loop.
+ ULHeader->addSuccessor(ULHeader);
+ OrigPreheader->replaceSuccessor(OrigHeader, ULPreheader);
+
+ // Create basic blocks for the Remainder Loop. The original loop header
+ // is used as the remainder loop header. The loop trip count is adjusted
+ // later to the appropriate value.
+ RLHeader = OrigHeader;
+
+ ULExit = MF->CreateMachineBasicBlock();
+ MF->insert(RLHeader->getIterator(), ULExit);
+
+ RLPreheader = MF->CreateMachineBasicBlock();
+ MF->insert(RLHeader->getIterator(), RLPreheader);
+
+ RLExit = MF->CreateMachineBasicBlock();
+ MF->insert(++RLHeader->getIterator(), RLExit);
+
+ ULExit->addSuccessor(RLPreheader);
+ RLPreheader->addSuccessor(RLHeader);
+
+ // ULExit is reachable from the unrolled loop and directly from the
+ // original preheader; it leads either into the remainder loop (via
+ // RLPreheader, added above) or straight to RLExit.
+ ULHeader->addSuccessor(ULExit);
+ OrigPreheader->addSuccessor(ULExit);
+ ULExit->addSuccessor(RLExit);
+ RLExit->addSuccessor(OrigLoopExit);
+ RLHeader->replaceSuccessor(OrigLoopExit, RLExit);
+
+ // Remember every block that now makes up the unrolled/remainder structure.
+ LoopBBs.push_back(ULPreheader);
+ LoopBBs.push_back(ULHeader);
+ LoopBBs.push_back(ULExit);
+ LoopBBs.push_back(RLPreheader);
+ LoopBBs.push_back(RLHeader);
+ LoopBBs.push_back(RLExit);
+
+ // Update the Phis in RLHeader (same as OrigHeader) and
+ // OrigLoopExit to use the new predecessors.
+ // PHI operands after the def are (value, predecessor block) pairs, so
+ // operand i + 1 is the block that belongs to the value in operand i.
+ for (MachineBasicBlock::iterator I = RLHeader->instr_begin(),
+ E = RLHeader->getFirstNonPHI();
+ I != E; ++I) {
+ MachineInstr *Phi = &*I;
+ for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2)
+ if (Phi->getOperand(i + 1).getMBB() != RLHeader)
+ Phi->getOperand(i + 1).setMBB(RLPreheader);
+ }
+
+ for (MachineBasicBlock::iterator I = OrigLoopExit->instr_begin(),
+ E = OrigLoopExit->getFirstNonPHI();
+ I != E; ++I) {
+ MachineInstr *Phi = &*I;
+ for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2)
+ if (Phi->getOperand(i + 1).getMBB() == RLHeader)
+ Phi->getOperand(i + 1).setMBB(RLExit);
+ }
+}
+
+/// Return the incoming-value operand of \p Phi whose predecessor block is not
+/// \p LoopBB, i.e. the value that enters the loop from outside.
+static MachineOperand &getInitPhiOp(MachineInstr *Phi,
+                                    MachineBasicBlock *LoopBB) {
+  // Operands after the def come in (value, predecessor block) pairs.
+  unsigned ValIdx = 1;
+  unsigned NumOps = Phi->getNumOperands();
+  while (ValIdx != NumOps) {
+    MachineBasicBlock *Pred = Phi->getOperand(ValIdx + 1).getMBB();
+    if (Pred != LoopBB)
+      return Phi->getOperand(ValIdx);
+    ValIdx += 2;
+  }
+  llvm_unreachable("Unexpected Phi structure.");
+}
+
+/// Return the register of the value that \p Phi receives from a predecessor
+/// other than \p LoopBB.
+static unsigned getInitPhiReg(MachineInstr *Phi, MachineBasicBlock *LoopBB) {
+  // Same operand search as getInitPhiOp(); only the register is needed here.
+  return getInitPhiOp(Phi, LoopBB).getReg();
+}
+
+/// Return the incoming-value operand of \p Phi whose predecessor block is
+/// \p LoopBB, i.e. the value carried around the loop.
+static MachineOperand &getLoopPhiOp(MachineInstr *Phi,
+                                    MachineBasicBlock *LoopBB) {
+  // Operands after the def come in (value, predecessor block) pairs.
+  unsigned ValIdx = 1;
+  unsigned NumOps = Phi->getNumOperands();
+  while (ValIdx != NumOps) {
+    MachineBasicBlock *Pred = Phi->getOperand(ValIdx + 1).getMBB();
+    if (Pred == LoopBB)
+      return Phi->getOperand(ValIdx);
+    ValIdx += 2;
+  }
+  llvm_unreachable("Unexpected Phi structure.");
+}
+
+/// Return the register of the value that \p Phi receives from \p LoopBB.
+static unsigned getLoopPhiReg(MachineInstr *Phi, MachineBasicBlock *LoopBB) {
+  // Same operand search as getLoopPhiOp(); only the register is needed here.
+  return getLoopPhiOp(Phi, LoopBB).getReg();
+}
+
+/// Return the predecessor block paired with the first incoming value of
+/// \p Phi that equals \p Reg, or nullptr if \p Reg is not an incoming value.
+static MachineBasicBlock *getPhiRegBB(MachineInstr *Phi, unsigned Reg) {
+  // Operands after the def come in (value, predecessor block) pairs.
+  for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2)
+    if (Phi->getOperand(i).getReg() == Reg)
+      return Phi->getOperand(i + 1).getMBB();
+  return nullptr;
+}
+
+/// Rewrite to \p ToReg every use of \p FromReg located in \p MBB. A PHI use
+/// is rewritten only when getPhiRegBB() reports \p MBB as the predecessor
+/// paired with \p FromReg.
+static void replaceRegUses(unsigned FromReg, unsigned ToReg,
+                           MachineBasicBlock *MBB, MachineRegisterInfo &MRI) {
+  MachineRegisterInfo::use_iterator UI = MRI.use_begin(FromReg);
+  MachineRegisterInfo::use_iterator UE = MRI.use_end();
+  while (UI != UE) {
+    // Advance first; the operand may be rewritten below.
+    MachineOperand &UseOp = *UI;
+    ++UI;
+
+    MachineInstr *User = UseOp.getParent();
+    bool PhiFromOtherBlock = User->isPHI() && getPhiRegBB(User, FromReg) != MBB;
+    if (!PhiFromOtherBlock && User->getParent() == MBB)
+      UseOp.setReg(ToReg);
+  }
+}
+
+/// Clone \p Phi (which must be a PHI of \p OrigBB) and rewrite the clone so
+/// that operands 1/2 carry the value from outside the loop paired with
+/// \p BB1, and operands 3/4 carry the loop value paired with \p BB. The clone
+/// is returned without being inserted into a block.
+static MachineInstr *clonePHI(MachineBasicBlock *BB, MachineBasicBlock *BB1,
+                              MachineBasicBlock *OrigBB, MachineInstr *Phi) {
+  const unsigned FromOutside = getInitPhiReg(Phi, OrigBB);
+  const unsigned FromLoop = getLoopPhiReg(Phi, OrigBB);
+
+  MachineInstr *Clone = OrigBB->getParent()->CloneMachineInstr(Phi);
+  // Only operands 1..4 are touched, so a two-input PHI is assumed.
+  MachineOperand &InitValOp = Clone->getOperand(1);
+  MachineOperand &InitBBOp = Clone->getOperand(2);
+  MachineOperand &LoopValOp = Clone->getOperand(3);
+  MachineOperand &LoopBBOp = Clone->getOperand(4);
+  InitValOp.setReg(FromOutside);
+  InitBBOp.setMBB(BB1);
+  LoopValOp.setReg(FromLoop);
+  LoopBBOp.setMBB(BB);
+  return Clone;
+}
+
+/// Return true if \p MBB is not one of the blocks listed in \p LoopBBs.
+static bool isBlockOutsideLoop(SmallVector<MachineBasicBlock *, 4> &LoopBBs,
+                               MachineBasicBlock *MBB) {
+  bool IsMember = false;
+  for (unsigned Idx = 0, N = LoopBBs.size(); Idx != N && !IsMember; ++Idx)
+    IsMember = (LoopBBs[Idx] == MBB);
+  return !IsMember;
+}
+
+/// Rewrite to \p ToReg every use of \p FromReg that sits in a block not
+/// listed in \p LoopBBs. The instruction defining \p ToReg is left untouched.
+static void
+replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
+                        MachineRegisterInfo &MRI,
+                        SmallVector<MachineBasicBlock *, 4> &LoopBBs) {
+  MachineInstr *ToRegDef = MRI.getVRegDef(ToReg);
+  MachineRegisterInfo::use_iterator UI = MRI.use_begin(FromReg);
+  MachineRegisterInfo::use_iterator UE = MRI.use_end();
+  while (UI != UE) {
+    // Advance first; the operand may be rewritten below.
+    MachineOperand &UseOp = *UI;
+    ++UI;
+
+    MachineInstr *User = UseOp.getParent();
+    if (User == ToRegDef)
+      continue;
+    if (isBlockOutsideLoop(LoopBBs, User->getParent()))
+      UseOp.setReg(ToReg);
+  }
+}
+
+/// Follow the renaming chain for \p reg recorded under \p BB in \p VRMap and
+/// return the register at its end. During unrolling each iteration gives
+/// 'reg' a new name, and those names are tracked in the map. \p reg itself is
+/// returned when \p BB has no entry or \p reg is not mapped.
+unsigned MachineUnroller::getLatestInstance(unsigned reg, MachineBasicBlock *BB,
+                                            ValueMapTy &VRMap) {
+  auto BlockEntry = VRMap.find(BB);
+  if (BlockEntry == VRMap.end())
+    return reg;
+
+  auto &Renames = BlockEntry->second;
+  unsigned Current = reg;
+  // Stop when the register is unmapped or mapped to itself.
+  for (auto Next = Renames.find(Current);
+       Next != Renames.end() && Next->second != Current;
+       Next = Renames.find(Current))
+    Current = Next->second;
+  return Current;
+}
+
+/// Update the machine instruction with new virtual registers. This
+/// function is only used to update the instructions in the unrolled
+/// loop header. It may change the definitions and/or uses.
+/// \p FirstIter is true while the first unrolled copy of the body is being
+/// processed. \p OldVRMap is the caller's snapshot of VRMap that is consulted
+/// for PHI uses in later iterations.
+void MachineUnroller::updateInstruction(MachineInstr *NewMI, bool FirstIter,
+ ValueMapTy &OldVRMap) {
+ MachineBasicBlock *BB = NewMI->getParent();
+ // New def mappings are collected here and only published to VRMap after all
+ // operands of NewMI have been processed.
+ DenseMap<unsigned, unsigned> NewVRMap;
+ DenseMap<unsigned, unsigned> &BBVRMap = VRMap[BB];
+ for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = NewMI->getOperand(i);
+ if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
+ continue;
+ unsigned reg = MO.getReg();
+ if (MO.isDef()) {
+ // Create a new virtual register for the definition.
+ const TargetRegisterClass *RC = MRI->getRegClass(reg);
+ unsigned NewReg = MRI->createVirtualRegister(RC);
+ MO.setReg(NewReg);
+ NewVRMap[reg] = NewReg;
+ if (NewMI->isPHI())
+ ULPhiVRMap[reg] = NewReg;
+ } else if (MO.isUse()) {
+ MachineInstr *DefMI = MRI->getVRegDef(reg);
+ if (DefMI && DefMI->isPHI()) {
+ if (NewMI->isPHI() && FirstIter)
+ // Don't change the 'use' yet based on the new def reg. It will be
+ // changed later to use the last instance of the value reaching
+ // from the loop after it has been unrolled.
+ continue;
+ else if (!FirstIter) {
+ // Get mapped reg:
+ // 1) If 'use' is a PHI, use the mapped reg from the previous
+ // iteration.
+ // 2) If 'use' is a non-PHI, use the mapped reg from the current
+ // iteration.
+ unsigned LatestReg = NewMI->isPHI()
+ ? getLatestInstance(reg, BB, OldVRMap)
+ : getLatestInstance(reg, BB, VRMap);
+ MO.setReg(LatestReg);
+ continue;
+ }
+ }
+ // Remaining uses take the register recorded for this block, unless that
+ // register is defined by NewMI itself.
+ if (BBVRMap.count(reg)) {
+ unsigned MappedReg = BBVRMap[reg];
+ if (MRI->getVRegDef(MappedReg) != NewMI)
+ MO.setReg(MappedReg);
+ }
+ }
+ }
+
+ // Publish the defs created above so later instructions see the new names.
+ for (auto Val : NewVRMap)
+ VRMap[BB][Val.first] = Val.second;
+}
+
+/// Try to determine by how much the base register of memory instruction
+/// \p MI advances per loop iteration. On success store the amount in
+/// \p Delta and return true. Return false if the base/offset cannot be
+/// analyzed, the offset is scalable, the base is not a register, no increment
+/// can be found, or the increment is negative.
+bool MachineUnroller::computeDelta(MachineInstr &MI, unsigned &Delta) const {
+  const MachineOperand *BaseOp;
+  int64_t Offset;
+  bool OffsetIsScalable;
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  const bool HasBase =
+      TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI);
+  if (!HasBase || OffsetIsScalable || !BaseOp->isReg())
+    return false;
+
+  // Check if there is a Phi. If so, get the definition in the loop.
+  unsigned BaseReg = BaseOp->getReg();
+  MachineInstr *BaseDef = MRI->getVRegDef(BaseReg);
+  if (BaseDef && BaseDef->isPHI()) {
+    MachineBasicBlock *LoopBB = MI.getParent();
+    if (BaseDef->getParent() != LoopBB)
+      return false;
+    BaseReg = getLoopPhiReg(BaseDef, LoopBB);
+    BaseDef = MRI->getVRegDef(BaseReg);
+  }
+  if (!BaseDef)
+    return false;
+
+  // Conservative reaction to negative offsets: treat them as unknown.
+  int Increment = 0;
+  if (!TII->getIncrementValue(*BaseDef, Increment) || Increment < 0)
+    return false;
+  Delta = Increment;
+  return true;
+}
+
+/// Update the memory operands with a new offset when the unroller
+/// generates a new copy of the instruction that refers to a
+/// different memory location.
+/// \p NewMI is the copy created for unrolled iteration \p iter and \p OldMI
+/// is the instruction it was cloned from. Nothing is changed for iteration 0
+/// or when \p NewMI has no memory operands.
+void MachineUnroller::updateMemOperands(MachineInstr *NewMI,
+                                        MachineInstr *OldMI,
+                                        unsigned iter) const {
+  if (iter == 0 || NewMI->memoperands_empty())
+    return;
+
+  // If the instruction has memory operands, then adjust the offset
+  // when the instruction appears in different iterations.
+  SmallVector<MachineMemOperand *, 2> NewMMOs;
+  for (MachineMemOperand *MMO : NewMI->memoperands()) {
+    // These operands are carried over unchanged.
+    if (MMO->isVolatile() || (MMO->isInvariant() && MMO->isDereferenceable()) ||
+        !MMO->getValue()) {
+      NewMMOs.push_back(MMO);
+      continue;
+    }
+    unsigned Delta;
+    if (computeDelta(*OldMI, Delta)) {
+      // Widen before multiplying so the product is formed in 64 bits rather
+      // than wrapping in 32-bit unsigned arithmetic.
+      int64_t AdjOffset = static_cast<int64_t>(Delta) * iter;
+      NewMMOs.push_back(
+          MF->getMachineMemOperand(MMO, AdjOffset, MMO->getType()));
+    } else {
+      // Per-iteration stride unknown: rebuild the operand with a zero offset
+      // and a default-constructed LLT.
+      NewMMOs.push_back(MF->getMachineMemOperand(MMO, 0, LLT()));
+    }
+  }
+  NewMI->setMemRefs(*MF, NewMMOs);
+}
+
+/// Adjust offset value for the instructions with memory operands when their
+/// copies are generated after first iteration. By adjusting the offset and
+/// using the right base register, we can avoid unnecessary 'add' instructions
+/// that are used to increment the offset for each iteration.
+
+/// Generate instructions for the unrolled loop header.
+/// The non-terminator part of OrigHeader is replicated UnrollFactor times
+/// into ULHeader with registers renamed per iteration, the terminators are
+/// then copied once, and finally the loop-carried operand of every PHI in
+/// ULHeader is pointed at the latest renamed value.
+void MachineUnroller::generateUnrolledLoop() {
+ for (unsigned iter = 0; iter < UnrollFactor; iter++) {
+ // Snapshot of the mappings as they stand before this iteration is emitted.
+ ValueMapTy OldVRMap = VRMap;
+ for (MachineBasicBlock::iterator I = OrigHeader->instr_begin(),
+ E = OrigHeader->getFirstTerminator();
+ I != E; ++I) {
+ MachineInstr *MI = &*I;
+ bool FirstIter = (iter == 0);
+ if (MI->isPHI() && !FirstIter) {
+ // Just create a new dummy register name for the PHI def and map
+ // it to LoopVal reaching from the previous iteration.
+ unsigned OrigReg = MI->getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI->getRegClass(OrigReg);
+ unsigned NewReg = MRI->createVirtualRegister(RC);
+ VRMap[ULHeader][OrigReg] = NewReg;
+ unsigned LoopVal = getLoopPhiReg(MI, OrigHeader);
+ if (RC == MRI->getRegClass(LoopVal)) {
+ // Same register class: no instruction is emitted, the new name is
+ // simply chained to the latest instance of LoopVal.
+ VRMap[ULHeader][NewReg] =
+ getLatestInstance(LoopVal, ULHeader, OldVRMap);
+ continue;
+ } else {
+ // Different register class: emit a COPY into NewReg. The COPY is
+ // built in MI's block and then moved to the end of ULHeader.
+ unsigned LatestReg = getLatestInstance(LoopVal, ULHeader, OldVRMap);
+ MachineBasicBlock *BB = MI->getParent();
+ MachineBasicBlock::iterator At = BB->getFirstTerminator();
+ const DebugLoc &DL = BB->findDebugLoc(At);
+ MachineInstr *NMI =
+ BuildMI(*BB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
+ .addReg(LatestReg);
+ NMI->removeFromParent();
+ ULHeader->push_back(NMI);
+ VRMap[ULHeader][OrigReg] = NewReg;
+ continue;
+ }
+ }
+ // First-iteration PHIs are cloned with their incoming blocks rewritten;
+ // every other instruction is cloned as is.
+ MachineInstr *NewMI =
+ MI->isPHI() ? clonePHI(ULHeader, ULPreheader, OrigHeader, MI)
+ : MF->CloneMachineInstr(MI);
+ ULHeader->push_back(NewMI);
+ updateInstruction(NewMI, iter == 0, OldVRMap);
+ updateMemOperands(NewMI, MI, iter);
+ }
+ }
+
+ // Copy any terminator instructions to the unrolled loop header.
+ for (MachineBasicBlock::iterator I = OrigHeader->getFirstTerminator(),
+ E = OrigHeader->instr_end();
+ I != E; ++I) {
+ MachineInstr *NewMI = MF->CloneMachineInstr(&*I);
+ ULHeader->push_back(NewMI);
+ updateInstruction(NewMI, false, VRMap);
+ }
+
+ // Update PHIs
+ // The loop-carried operand of each PHI is replaced by the latest instance
+ // of that register recorded for ULHeader.
+ for (MachineBasicBlock::iterator I = ULHeader->instr_begin(),
+ E = ULHeader->getFirstNonPHI();
+ I != E; ++I) {
+ MachineInstr *Phi = &*I;
+ MachineOperand &MO = getLoopPhiOp(Phi, ULHeader);
+ unsigned reg = MO.getReg();
+ MO.setReg(getLatestInstance(reg, ULHeader, VRMap));
+ }
+}
+
+/// Regenerate post-increment load/store instructions. Also, update the offset
+/// value for the load/store instructions that use the same base address as the
+/// newly created post-increment load/store.
+
+/// Generate Phis for the exit block for the unrolled loop.
+/// For every PHI in OrigHeader a PHI is created in ULExit that merges the
+/// initial value (arriving from OrigPreheader) with the latest instance of
+/// the loop value (arriving from ULHeader). The OrigHeader PHI is then
+/// rewired to take that new PHI as its initial value.
+void MachineUnroller::generatePhisForULExit() {
+ // Snapshot taken before this function adds ULExit entries to VRMap.
+ ValueMapTy OldVRMap = VRMap;
+ for (MachineBasicBlock::iterator I = OrigHeader->instr_begin(),
+ E = OrigHeader->getFirstNonPHI();
+ I != E; ++I) {
+ MachineInstr *Phi = &*I;
+ assert(Phi->isPHI() && "Expecting a Phi.");
+ unsigned DefReg = Phi->getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+ unsigned InitVal = getInitPhiReg(Phi, OrigHeader);
+ unsigned LoopVal = getLoopPhiReg(Phi, OrigHeader);
+
+ assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
+ // NOTE(review): LoopInst is dereferenced without a null check; this relies
+ // on LoopVal having a definition -- confirm.
+ MachineInstr *LoopInst = MRI->getVRegDef(LoopVal);
+ unsigned PhiOp1 = InitVal;
+ unsigned PhiOp2 = LoopInst->isPHI()
+ ? getLatestInstance(LoopVal, ULHeader, OldVRMap)
+ : getLatestInstance(LoopVal, ULHeader, VRMap);
+
+ unsigned NewReg = MRI->createVirtualRegister(RC);
+ MachineInstrBuilder NewPhi =
+ BuildMI(*ULExit, ULExit->getFirstNonPHI(), DebugLoc(),
+ TII->get(TargetOpcode::PHI), NewReg);
+ NewPhi.addReg(PhiOp1).addMBB(OrigPreheader);
+ NewPhi.addReg(PhiOp2).addMBB(ULHeader);
+ VRMap[ULExit][DefReg] = NewReg;
+ replaceRegUses(DefReg, NewReg, ULExit, *MRI);
+
+ // Update Phi in the original loop header to use 'NewReg'
+ // as the initial value.
+ getInitPhiOp(Phi, OrigHeader).setReg(NewReg);
+ }
+
+ // Generate additional PHIs for the values that are live-in for
+ // the original loop exit block.
+ generateNewPhis(ULExit, OrigPreheader, ULHeader);
+}
+
+/// Return the latest instance of \p Reg recorded for \p BB in VRMap. If
+/// \p Reg has no entry for \p BB, create a fresh register of the same class,
+/// define it with an IMPLICIT_DEF placed before the first non-PHI instruction
+/// of \p BB, and return that register instead.
+unsigned MachineUnroller::getMappedRegORCreate(unsigned Reg,
+                                               MachineBasicBlock *BB) {
+  const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
+  DenseMap<unsigned, unsigned> &BlockMap = VRMap[BB];
+  if (BlockMap.find(Reg) != BlockMap.end())
+    return getLatestInstance(Reg, BB, VRMap);
+
+  unsigned Placeholder = MRI->createVirtualRegister(RegClass);
+  BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(),
+          TII->get(TargetOpcode::IMPLICIT_DEF), Placeholder);
+  return Placeholder;
+}
+
+/// For every register in ExitBBLiveIns, insert a PHI into \p BB that merges
+/// the register's value reaching from \p BB1 with the one reaching from
+/// \p BB2, and record the PHI's result as the register's mapping for \p BB.
+void MachineUnroller::generateNewPhis(MachineBasicBlock *BB,
+                                      MachineBasicBlock *BB1,
+                                      MachineBasicBlock *BB2) {
+  for (unsigned LiveReg : ExitBBLiveIns) {
+    // Look up BB1 before BB2: either lookup may create a new register.
+    const unsigned FromBB1 = getMappedRegORCreate(LiveReg, BB1);
+    const unsigned FromBB2 = getMappedRegORCreate(LiveReg, BB2);
+    const unsigned Merged =
+        MRI->createVirtualRegister(MRI->getRegClass(LiveReg));
+    BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI),
+            Merged)
+        .addReg(FromBB1)
+        .addMBB(BB1)
+        .addReg(FromBB2)
+        .addMBB(BB2);
+    VRMap[BB][LiveReg] = Merged;
+  }
+}
+
+/// Generate Phis for the exit block for the remainder loop, then redirect
+/// uses outside the blocks in LoopBBs from the OrigHeader value to the
+/// matching PHI result.
+void MachineUnroller::generatePhisForRLExit() {
+  // Generate PHIs for the values that are live-in for
+  // the original loop exit block.
+  generateNewPhis(RLExit, ULExit, RLHeader);
+
+  MachineBasicBlock::iterator PhiIt = RLExit->instr_begin();
+  MachineBasicBlock::iterator PhiEnd = RLExit->getFirstNonPHI();
+  for (; PhiIt != PhiEnd; ++PhiIt) {
+    MachineInstr &Phi = *PhiIt;
+    // Find the value arriving from OrigHeader; the last match wins.
+    unsigned FromOrigHeader = 0;
+    const unsigned NumOps = Phi.getNumOperands();
+    for (unsigned ValIdx = 1; ValIdx != NumOps; ValIdx += 2) {
+      if (Phi.getOperand(ValIdx + 1).getMBB() == OrigHeader)
+        FromOrigHeader = Phi.getOperand(ValIdx).getReg();
+    }
+    assert(FromOrigHeader != 0 && "Unexpected Phi structure.");
+    const unsigned PhiResult = Phi.getOperand(0).getReg();
+    replaceRegUsesAfterLoop(FromOrigHeader, PhiResult, *MRI, LoopBBs);
+  }
+}
+
+/// Collect into ExitBBLiveIns every virtual register defined in OrigHeader
+/// that has at least one use in an instruction outside OrigHeader.
+void MachineUnroller::getExitBBLiveIns() {
+  for (MachineInstr &MI : OrigHeader->instrs()) {
+    for (MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg() || !MO.isDef() ||
+          !Register::isVirtualRegister(MO.getReg()))
+        continue;
+
+      unsigned DefReg = MO.getReg();
+      // One outside use is enough; stop at the first one found.
+      for (MachineOperand &UseOp : MRI->use_operands(DefReg)) {
+        if (UseOp.getParent()->getParent() != OrigHeader) {
+          ExitBBLiveIns.push_back(DefReg);
+          break;
+        }
+      }
+    }
+  }
+}
+
+ /// Seed VRMap for \p BB with an identity entry (register maps to itself) for
+ /// every virtual register defined by an instruction of \p BB.
+ void MachineUnroller::addBBIntoVRMap(MachineBasicBlock *BB) {
+   for (MachineInstr &MI : BB->instrs()) {
+     for (MachineOperand &MO : MI.operands()) {
+       if (!MO.isReg() || !MO.isDef())
+         continue;
+       if (!Register::isVirtualRegister(MO.getReg()))
+         continue;
+       unsigned DefReg = MO.getReg();
+       VRMap[BB][DefReg] = DefReg;
+     }
+   }
+ }
+
+ /// Remove all Phi instructions from BB.
+ ///
+ /// Before a Phi is erased, every use of its result is redirected to the
+ /// register returned by getInitPhiReg for that Phi.
+ static void cleanUpPHIs(MachineBasicBlock *BB, MachineRegisterInfo &MRI) {
+   MachineBasicBlock::iterator Cursor = BB->instr_begin();
+   MachineBasicBlock::iterator PhiEnd = BB->getFirstNonPHI();
+   while (Cursor != PhiEnd) {
+     // Step past the Phi first: it is erased at the bottom of the loop.
+     MachineInstr *Phi = &*Cursor;
+     ++Cursor;
+
+     unsigned InitVal = getInitPhiReg(Phi, BB);
+     unsigned PhiDef = Phi->getOperand(0).getReg();
+
+     // setReg() moves the operand to another use list, so advance the use
+     // iterator before rewriting the operand it pointed at.
+     MachineRegisterInfo::use_iterator UI = MRI.use_begin(PhiDef);
+     MachineRegisterInfo::use_iterator UE = MRI.use_end();
+     while (UI != UE) {
+       MachineOperand &Use = *UI;
+       ++UI;
+       Use.setReg(InitVal);
+     }
+     Phi->eraseFromParent();
+   }
+ }
+
+ /// Fix all the branches for the unrolled and remainder loops. Also, update
+ /// the loop count.
+ ///
+ /// \param ULCount register holding the trip count passed to changeLoopCount
+ ///        for the unrolled loop.
+ /// \param RLCount register holding the trip count passed to changeLoopCount
+ ///        for the remainder loop.
+ void MachineUnroller::fixBranchesAndLoopCount(unsigned ULCount,
+ unsigned RLCount) {
+ // Cond captures the branch condition of the unrolled loop header; it is
+ // reused further down when the remainder loop header is re-branched.
+ SmallVector<MachineOperand, 4> Cond;
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ bool checkBranch = TII->analyzeBranch(*ULHeader, TBB, FBB, Cond);
+ assert(!checkBranch && "Can't analyze the branch in UnrolledLoop Header");
+ (void)checkBranch;
+
+ // Unrolled loop header: back edge to itself, exit to ULExit.
+ TII->removeBranch(*ULHeader);
+ TII->insertBranch(*ULHeader, ULHeader, ULExit, Cond, DebugLoc());
+
+ // Change loop count for the Unrolled loop and fixup branches.
+ // Cond1 is filled in by the target's changeLoopCount.
+ // NOTE(review): the Hexagon changeLoopCount has early returns that leave
+ // Cond1 empty; confirm insertBranch tolerates that with two targets.
+ SmallVector<MachineOperand, 4> Cond1;
+ changeLoopCount(*OrigPreheader, *ULPreheader, *ULHeader, *L->getTopBlock(),
+ ULCount, Cond1);
+ TII->insertBranch(*OrigPreheader, ULExit, ULPreheader, Cond1, DebugLoc());
+ Cond1.clear();
+ // With an empty condition this is an unconditional branch to ULHeader.
+ TII->insertBranch(*ULPreheader, ULHeader, nullptr, Cond1, DebugLoc());
+
+ // Copy instructions from the unrolled loop preheader as it may contain
+ // loop setup instructions also needed for the Remainder loop.
+ for (MachineBasicBlock::iterator I = ULPreheader->instr_begin(),
+ E = ULPreheader->getFirstTerminator();
+ I != E; ++I) {
+ MachineInstr *MI = &*I;
+ MachineInstr *NewMI = MF->CloneMachineInstr(MI);
+ ULExit->push_back(NewMI);
+ }
+
+ // Change loop count for the Remainder loop and fixup branches.
+ TII->removeBranch(*RLHeader);
+ TII->insertBranch(*RLHeader, RLHeader, RLExit, Cond, DebugLoc());
+
+ Cond1.clear();
+ changeLoopCount(*ULExit, *RLPreheader, *RLHeader, *L->getTopBlock(), RLCount,
+ Cond1);
+ TII->insertBranch(*ULExit, RLExit, RLPreheader, Cond1, DebugLoc());
+
+ Cond1.clear();
+ TII->insertBranch(*RLPreheader, RLHeader, nullptr, Cond1, DebugLoc());
+ TII->insertBranch(*RLExit, OrigLoopExit, nullptr, Cond1, DebugLoc());
+ // A remainder header with a single successor no longer has a back edge
+ // (changeLoopCount may have removed it), so its Phis are dropped.
+ if (RLHeader->succ_size() == 1)
+ cleanUpPHIs(RLHeader, *MRI);
+ }
+
+ /// Strip subregister indices from the incoming operands of the PHIs in \p B.
+ ///
+ /// For each incoming operand that carries a subregister index, a COPY into a
+ /// fresh register (of the PHI result's register class) is inserted before the
+ /// first terminator of the corresponding predecessor, and the PHI operand is
+ /// rewritten to use that register without a subregister index.
+ void MachineUnroller::preprocessPhiNodes(MachineBasicBlock &B) {
+   for (MachineInstr &Phi : B.phis()) {
+     MachineOperand &Result = Phi.getOperand(0);
+     assert(Result.getSubReg() == 0);
+     auto *ResultRC = MRI->getRegClass(Result.getReg());
+
+     // Incoming values are the odd operands; each is followed by its block.
+     for (unsigned Idx = 1, NumOps = Phi.getNumOperands(); Idx != NumOps;
+          Idx += 2) {
+       MachineOperand &Incoming = Phi.getOperand(Idx);
+       if (Incoming.getSubReg() != 0) {
+         unsigned FullReg = MRI->createVirtualRegister(ResultRC);
+         MachineBasicBlock &Pred = *Phi.getOperand(Idx + 1).getMBB();
+         MachineBasicBlock::iterator InsertPt = Pred.getFirstTerminator();
+         const DebugLoc &DL = Pred.findDebugLoc(InsertPt);
+         BuildMI(Pred, InsertPt, DL, TII->get(TargetOpcode::COPY), FullReg)
+             .addReg(Incoming.getReg(), getRegState(Incoming),
+                     Incoming.getSubReg());
+         Incoming.setReg(FullReg);
+         Incoming.setSubReg(0);
+       }
+     }
+   }
+ }
+
+ /// Unroll \p loop by \p unrollFactor, producing an unrolled loop followed by
+ /// a remainder loop.
+ ///
+ /// \returns false, leaving the function untouched, when canUnroll() rejects
+ /// the loop; true once the transformation has been applied.
+ bool MachineUnroller::unroll(MachineLoop *loop, unsigned unrollFactor) {
+   init(loop, unrollFactor);
+   if (!canUnroll())
+     return false;
+
+   // Remove any subregisters from input to phi nodes.
+   preprocessPhiNodes(*loop->getHeader());
+
+   // Add all the def regs in the loop header in VRMap.
+   addBBIntoVRMap(OrigHeader);
+   getExitBBLiveIns();
+
+   // Create empty basic blocks for the unrolled version of the loop.
+   createUnrolledLoopStruct();
+
+   // Add instructions to compute trip counts for the unrolled and
+   // remainder loops. The preheader's branch is removed first because the
+   // count instructions are appended at the end of the block.
+   TII->removeBranch(*OrigPreheader);
+   unsigned ULCount = addUnrolledLoopCountMI(*OrigPreheader, LC, UnrollFactor);
+   unsigned RLCount = addRemLoopCountMI(*OrigPreheader, LC, UnrollFactor);
+
+   // Add instructions to the Unrolled loop header.
+   generateUnrolledLoop();
+
+   // Generate Phis for the unrolled loop exit block and also update
+   // Phis in the remainder loop header to use the correct initial values.
+   generatePhisForULExit();
+
+   // Generate Phis for the remainder loop exit block.
+   generatePhisForRLExit();
+
+   // Optimize unrolled loop header.
+   optimize(*ULHeader);
+
+   // Update branches and adjust loop count.
+   fixBranchesAndLoopCount(ULCount, RLCount);
+
+   // Modify existing loop to point to the unrolled loop header.
+   L->removeBlockFromLoop(OrigHeader);
+   L->addBasicBlockToLoop(ULHeader, *MLI);
+   return true;
+ }
diff --git a/llvm/lib/CodeGen/MachineUnrollerPass.cpp b/llvm/lib/CodeGen/MachineUnrollerPass.cpp
new file mode 100644
index 0000000000000..db1a72a6cc5c0
--- /dev/null
+++ b/llvm/lib/CodeGen/MachineUnrollerPass.cpp
@@ -0,0 +1,596 @@
+//===- MachineUnrollerPass.cpp - Machine loop unroller pass ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements loop unrolling functionality at the machine instruction
+// (MI) level.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineUnroller.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-unroller"
+
+using NV = DiagnosticInfoOptimizationBase::Argument;
+
+ /// A command line option to turn MI Loop Unrolling on or off.
+ /// Defaults to on.
+ static cl::opt<bool> EnableMIUnroller("enable-machine-unroller", cl::Hidden,
+ cl::init(true), cl::ZeroOrMore,
+ cl::desc("Enable MI Loop Unrolling"));
+
+ /// A command line argument to limit size of the unrolled loop.
+ /// The limit is compared against (non-debug loop body size * unroll factor).
+ static cl::opt<unsigned>
+ MachineUnrollerThres("machine-unroller-threshold",
+ cl::desc("Size limit for the unrolled loop."),
+ cl::Hidden, cl::init(30));
+
+ /// A command line option to enable MI Loop Unrolling at -Os.
+ static cl::opt<bool>
+ EnableMIUnrollerOptSize("enable-machine-unroller-opt-size",
+ cl::desc("Enable MI Loop Unrolling at Os."),
+ cl::Hidden, cl::init(false));
+
+ #ifndef NDEBUG
+ /// Debug-build-only cap on the number of loops the pass will try to unroll.
+ /// A negative value (the default, -1) means no cap.
+ static cl::opt<int> UnrollerLimit("machine-unroller-max", cl::Hidden,
+ cl::init(-1));
+ #endif
+
+ /// Set of machine instructions; used to carry the self-dependent instructions
+ /// of a loop between the analysis helpers in this file.
+ using MISet = std::set<MachineInstr *>;
+
+ namespace {
+ /// Machine function pass that walks the loop nest of a function and asks the
+ /// target's MachineUnroller to unroll the loops it considers profitable.
+ class MachineUnrollerPass : public MachineUnrollerContext,
+ public MachineFunctionPass {
+ const TargetPassConfig *PassConfig = nullptr;
+ // Created through PassConfig->createMachineUnroller() at the start of
+ // runOnMachineFunction and deleted at its end.
+ MachineUnroller *Unroller = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ MachineOptimizationRemarkEmitter *ORE = nullptr;
+ // Recurses into the sub-loops of L first, then tries L itself.
+ bool tryToUnrollLoop(MachineLoop &L);
+ // Builds the scheduling DAG for L's header, picks an unroll factor and
+ // unrolls when that factor is greater than 1.
+ bool unrollLoop(MachineLoop *L, unsigned UnrollFactor);
+ // Structural pre-check; currently only accepts single-block loops.
+ bool canUnrollLoop(MachineLoop *L);
+
+ public:
+ static char ID;
+ MachineUnrollerPass() : MachineFunctionPass(ID) {
+ initializeMachineUnrollerPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addRequired<MachineLoopInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ #ifndef NDEBUG
+ // Number of loops attempted so far; only advanced while the
+ // -machine-unroller-max limit is non-negative.
+ static int NumTries;
+ #endif
+ };
+
+ } // end anonymous namespace
+
+ // Pass identification; the address of ID is what identifies the pass.
+ char MachineUnrollerPass::ID = 0;
+ #ifndef NDEBUG
+ int MachineUnrollerPass::NumTries = 0;
+ #endif
+ // Exported handle to the pass ID.
+ char &llvm::MachineUnrollerPassID = MachineUnrollerPass::ID;
+ // Register the pass under the DEBUG_TYPE name together with the analyses it
+ // depends on.
+ INITIALIZE_PASS_BEGIN(MachineUnrollerPass, DEBUG_TYPE, "Machine Unrolling",
+ false, false)
+ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
+ INITIALIZE_PASS_END(MachineUnrollerPass, DEBUG_TYPE, "Machine Unrolling", false,
+ false)
+
+ namespace {
+ /// Scheduling DAG built over the header of a single loop. It is used only for
+ /// analysis: schedule() builds the dependence graph and getUnrollFactor()
+ /// derives the unroll factor from resource-constrained MII estimates.
+ ///
+ /// The class is local to this file, so it lives in an anonymous namespace
+ /// like MachineUnrollerPass.
+ class MachineUnrollerSchedDAG : public ScheduleDAGInstrs {
+   MachineUnrollerPass &Pass;
+   MachineLoop &Loop;
+   MachineOptimizationRemarkEmitter *ORE;
+
+ public:
+   MachineUnrollerSchedDAG(MachineUnrollerPass &P, MachineLoop &L,
+                           MachineOptimizationRemarkEmitter *ORE)
+       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), ORE(ORE) {}
+
+   /// Return the unroll factor chosen for the loop; 1 means "do not unroll".
+   unsigned getUnrollFactor();
+
+   /// Build the scheduling graph for the current region.
+   void schedule() override;
+
+ private:
+   /// Resource MII of the loop body replicated \p UnrollFactor times.
+   unsigned calculateResMII(unsigned UnrollFactor);
+
+   /// Raise \p ResMII to the largest data-dependence latency in the body.
+   unsigned adjustResMIIForExtraCopies(unsigned ResMII);
+
+   /// Heuristic veto based on self-dependent instructions and latencies.
+   bool shouldNotUnroll(MachineLoop &Loop, int MinResMII, MISet &selfDepInstr);
+ };
+ } // end anonymous namespace
+
+ namespace {
+ // FuncUnitSorter - Comparison operator used to sort instructions by
+ // the number of functional unit choices.
+ //
+ // The type is local to this file, hence the anonymous namespace.
+ struct FuncUnitSorter {
+   const InstrItineraryData *InstrItins;
+   // Occurrence count per functional-unit mask, filled by
+   // calcCriticalResources() for stages that have exactly one unit choice.
+   DenseMap<unsigned, unsigned> Resources;
+
+   FuncUnitSorter(const InstrItineraryData *IID) : InstrItins(IID) {}
+
+   // Compute the number of functional unit alternatives needed
+   // at each stage, and take the minimum value. We prioritize the
+   // instructions by the least number of choices first.
+   //
+   // On return F holds the unit mask of the stage with the fewest
+   // alternatives. When the instruction has no stages, UINT_MAX is returned
+   // and F is left untouched.
+   unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {
+     unsigned SchedClass = Inst->getDesc().getSchedClass();
+     unsigned Fewest = UINT_MAX;
+     for (const InstrStage *IS = InstrItins->beginStage(SchedClass),
+                           *IE = InstrItins->endStage(SchedClass);
+          IS != IE; ++IS) {
+       unsigned FuncUnits = IS->getUnits();
+       unsigned NumAlternatives = llvm::popcount(FuncUnits);
+       if (NumAlternatives < Fewest) {
+         Fewest = NumAlternatives;
+         F = FuncUnits;
+       }
+     }
+     return Fewest;
+   }
+
+   // Compute the critical resources needed by the instruction. This
+   // function records the functional units needed by instructions that
+   // must use only one functional unit. We use this as a tie breaker
+   // for computing the resource MII. The instructions that require
+   // the same, highly used, functional unit have high priority.
+   void calcCriticalResources(MachineInstr &MI) {
+     unsigned SchedClass = MI.getDesc().getSchedClass();
+     for (const InstrStage *IS = InstrItins->beginStage(SchedClass),
+                           *IE = InstrItins->endStage(SchedClass);
+          IS != IE; ++IS) {
+       unsigned FuncUnits = IS->getUnits();
+       if (llvm::popcount(FuncUnits) == 1)
+         Resources[FuncUnits]++;
+     }
+   }
+
+   /// Return true if IS1 has less priority than IS2.
+   bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
+     unsigned F1 = 0, F2 = 0;
+     unsigned MFUs1 = minFuncUnits(IS1, F1);
+     unsigned MFUs2 = minFuncUnits(IS2, F2);
+     // Both are tied to a single unit: the one whose unit is used by more
+     // instructions has the higher priority.
+     if (MFUs1 == 1 && MFUs2 == 1)
+       return Resources.lookup(F1) < Resources.lookup(F2);
+     // Otherwise fewer alternatives means higher priority.
+     return MFUs1 > MFUs2;
+   }
+ };
+ } // end anonymous namespace
+
+ /// Try to unroll \p L and, before that, every loop nested inside it.
+ ///
+ /// \returns true if any loop in the nest rooted at \p L was changed.
+ bool MachineUnrollerPass::tryToUnrollLoop(MachineLoop &L) {
+   bool Changed = false;
+   for (auto &InnerLoop : L)
+     Changed |= tryToUnrollLoop(*InnerLoop);
+
+ #ifndef NDEBUG
+   // Stop trying after reaching the limit (if any).
+   int Limit = UnrollerLimit;
+   if (Limit >= 0) {
+     if (NumTries >= Limit)
+       return Changed;
+     NumTries++;
+   }
+ #endif
+
+   if (!canUnrollLoop(&L))
+     return Changed;
+
+   // Accumulate rather than overwrite, so a change made in a sub-loop is
+   // still reported when this loop itself is left alone.
+   Changed |= unrollLoop(&L, 1);
+   return Changed;
+ }
+
+ /// Structural pre-check for \p L.
+ ///
+ /// \returns true when the loop consists of a single basic block.
+ bool MachineUnrollerPass::canUnrollLoop(MachineLoop *L) {
+   // Only loops with a single basic block are handled. Also, the loop must
+   // be analyzable using analyzeBranch. It is the responsibility of the caller
+   // of this function to make sure that these requirements are met.
+   if (L->getNumBlocks() > 1) {
+     // Terminate the line so the message does not run into the next one.
+     LLVM_DEBUG(
+         dbgs() << "Only loops with single basic block can be unrolled!!\n");
+     return false;
+   }
+
+   return true;
+ }
+
+ /// Analyze the header of \p L and unroll the loop when a factor greater than
+ /// one is found.
+ ///
+ /// \param UnrollFactor incoming value is ignored; the factor is computed by
+ ///        MachineUnrollerSchedDAG::getUnrollFactor().
+ /// \returns whatever MachineUnroller::unroll() reports, or false when the
+ ///          chosen factor is 1.
+ bool MachineUnrollerPass::unrollLoop(MachineLoop *L, unsigned UnrollFactor) {
+   MachineUnrollerSchedDAG MSD(*this, *L, ORE);
+
+   MachineBasicBlock *MBB = L->getHeader();
+   // The kernel should not include any terminator instructions. These
+   // will be added back later.
+   MSD.startBlock(MBB);
+   unsigned size = MBB->size();
+   for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
+                                    E = MBB->instr_end();
+        I != E; ++I, --size)
+     ;
+
+   MSD.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
+   MSD.schedule();
+   UnrollFactor = MSD.getUnrollFactor();
+   bool Changed = false;
+   if (UnrollFactor > 1)
+     Changed = Unroller->unroll(L, UnrollFactor);
+   MSD.exitRegion();
+   // Pair the startBlock() above so the DAG drops its reference to the block.
+   MSD.finishBlock();
+   return Changed;
+ }
+
+ /// Build the scheduling graph for the current region using the alias
+ /// analysis results obtained through the owning pass. No instruction is
+ /// reordered here.
+ void MachineUnrollerSchedDAG::schedule() {
+   AAResults &AliasInfo = Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
+   buildSchedGraph(&AliasInfo);
+ }
+
+ /// Count the instructions of \p MBB between its first non-PHI instruction and
+ /// its first terminator, ignoring debug instructions.
+ static unsigned getNonDebugMBBSize(MachineBasicBlock *MBB) {
+   int Count = 0;
+   for (MachineInstr &MI :
+        make_range(MBB->getFirstNonPHI(), MBB->getFirstTerminator()))
+     if (!MI.isDebugInstr())
+       ++Count;
+   return Count;
+ }
+
+ // Check if there is a self register dependence between same instruction across
+ // iterations.
+ //
+ // An instruction of the loop header is added to selfDepInstr when it both
+ // reads a register defined by some PHI of the header and defines a register
+ // that the same PHI reads.
+ void checkSelfDependence(MachineLoop &Loop, MISet &selfDepInstr) {
+ MachineBasicBlock *MBB = Loop.getHeader();
+ // Track Register Dependencies from PHI to Inst or from Inst to PHI
+ // (only the presence of a key matters; the mapped value is always true).
+ std::map<std::pair<MachineInstr *, MachineInstr *>, bool> deps;
+ // Registers defined by PHI Node
+ std::map<Register, MachineInstr *> phiDefs;
+ // Registers used by PHI Node
+ std::map<Register, std::vector<MachineInstr *>> phiUses;
+ // Populate phiDefs and phiUses
+ for (MachineBasicBlock::iterator I = MBB->instr_begin(),
+ E = MBB->getFirstNonPHI();
+ I != E; ++I) {
+ for (MachineOperand MO : I->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef())
+ phiDefs[MO.getReg()] = &*I;
+ else
+ phiUses[MO.getReg()].push_back(&*I);
+ }
+ }
+ // Self Dependency: Check for Instructions which define an operand used by
+ // PHI node and use an operand defined by PHI Node
+ for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
+ E = MBB->getFirstTerminator();
+ I != E; ++I) {
+ for (MachineOperand MO : I->operands()) {
+ if (!MO.isReg())
+ continue;
+ Register r = MO.getReg();
+ if (MO.isUse() && phiDefs.find(r) != phiDefs.end()) {
+ // Edge from PHI to Instruction
+ deps[{phiDefs[r], &*I}] = true;
+ // The reverse edge was already recorded: the cycle is closed.
+ if (deps.find({&*I, phiDefs[r]}) != deps.end())
+ selfDepInstr.insert(&*I);
+ } else if (MO.isDef() && phiUses.find(r) != phiUses.end()) {
+ // Edge from Instruction to PHI
+ for (MachineInstr *phi : phiUses[r]) {
+ deps[{&*I, phi}] = true;
+ if (deps.find({phi, &*I}) != deps.end())
+ selfDepInstr.insert(&*I);
+ }
+ }
+ }
+ }
+ }
+
+ // Do not unroll if the following conditions are true:
+ // 1. There exists a self dependent instruction with latency >= MinResMII.
+ // 2. No non-self dependent instruction has latency > MinResMII.
+ // 3. At least half of the instructions in the loop are independent.
+ //
+ // NOTE(review): the code below bails out (allows unrolling) as soon as a
+ // non-self-dependent instruction reaches latency >= MinResMII, not
+ // > MinResMII as condition 2 says, and condition 3 is evaluated against the
+ // instructions counted so far rather than the whole loop -- confirm intent.
+ bool MachineUnrollerSchedDAG::shouldNotUnroll(MachineLoop &Loop, int MinResMII,
+ MISet &selfDepInstr) {
+ MachineBasicBlock *MBB = Loop.getHeader();
+ MachineFunction *MF = MBB->getParent();
+ const InstrItineraryData *InstrItins =
+ MF->getSubtarget().getInstrItineraryData();
+ // Largest operand latency seen so far for each instruction.
+ std::map<MachineInstr *, int> Latencies;
+ bool ShouldNotUnroll = false;
+ // Number of non-PHI, non-debug instructions visited so far.
+ unsigned NonPhiInst = 0;
+ // Calculate latency for each instruction in the loop.
+ for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
+ E = MBB->getFirstTerminator();
+ I != E; ++I) {
+ LLVM_DEBUG({
+ dbgs() << "Instr = ";
+ I->dump();
+ });
+
+ if (I->isDebugInstr())
+ continue;
+ // Find the latency of each use-operand in the instruction.
+ NonPhiInst++;
+ for (MachineOperand MO : I->uses()) {
+ if (!MO.isReg() || MO.isImplicit() || MO.getReg().isPhysical())
+ continue;
+ MachineInstr *MIDef = Pass.MF->getRegInfo().getVRegDef(MO.getReg());
+ if (!MIDef)
+ continue;
+ int RegDefIdx =
+ MIDef->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr);
+ int RegUseIdx =
+ I->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr);
+ std::optional<unsigned> Curr =
+ TII->getOperandLatency(InstrItins, *MIDef, RegDefIdx, *I, RegUseIdx);
+ // If the latency calculated is null then set the operator latency to 1 or
+ // retain the value if it already exists.
+ if (Latencies.find(&*I) == Latencies.end()) {
+ if (Curr.has_value()) {
+ Latencies[&*I] = std::max((int)(*Curr), 1);
+ } else {
+ Latencies[&*I] = 1;
+ }
+ } else if (Curr.has_value()) {
+ Latencies[&*I] = std::max(Latencies[&*I], (int)(*Curr));
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "\tOperand = ";
+ MO.print(dbgs());
+ dbgs() << ",\tLatency = " << Latencies[&*I] << "\n";
+ });
+
+ if (Latencies[&*I] >= MinResMII) {
+ if (selfDepInstr.find(&*I) != selfDepInstr.end())
+ // Self dependent instruction with latency >= MinResMII and
+ // at least half of the instructions in the loop are independent.
+ ShouldNotUnroll |= ((Latencies[&*I] >= MinResMII) &&
+ (selfDepInstr.size() * 2 > NonPhiInst));
+ else {
+ // No non-self dependent instruction should have latency > MinResMII.
+ return false;
+ }
+ }
+ }
+ }
+ return ShouldNotUnroll;
+ }
+
+ /// Choose the unroll factor for the loop.
+ ///
+ /// The candidate factors 2 and 4 are compared by ResMII per original
+ /// iteration (ResMII of the replicated body divided by the factor). A
+ /// candidate is accepted when that ratio improves on the best one so far and
+ /// the unrolled body stays within -machine-unroller-threshold. Optimization
+ /// remarks describe the outcome.
+ ///
+ /// \returns the selected factor, or 1 when the loop should not be unrolled.
+ unsigned MachineUnrollerSchedDAG::getUnrollFactor() {
+ unsigned InitialResMII = calculateResMII(1);
+ InitialResMII = adjustResMIIForExtraCopies(InitialResMII);
+ unsigned MinResMII = InitialResMII;
+ unsigned MinUnrollFactor = 1;
+ // Largest unroll factor that is tried.
+ unsigned UnrollThres = 4;
+ unsigned LoopHeaderSize = getNonDebugMBBSize(Loop.getHeader());
+
+ // Check for instruction self dependencies
+ MISet selfDepInstr;
+ checkSelfDependence(Loop, selfDepInstr);
+ LLVM_DEBUG(dbgs() << "Self Dependent Inst count = " << selfDepInstr.size()
+ << "\n");
+ if (shouldNotUnroll(Loop, MinResMII, selfDepInstr)) {
+ LLVM_DEBUG(dbgs() << "Self Dependencies Found. Using unroll factor = 1\n");
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkMissed(
+ DEBUG_TYPE, "SelfDependency",
+ Loop.getHeader()->front().getDebugLoc(), Loop.getHeader())
+ << "Unable to unroll loop: self dependencies found";
+ });
+ return 1;
+ }
+
+ // AnyBenefit: some factor improved the per-iteration ResMII.
+ // AllBeneficialExceededThreshold: no improving factor fit the size limit.
+ bool AnyBenefit = false;
+ bool AllBeneficialExceededThreshold = true;
+ for (unsigned i = 2; i <= UnrollThres; i += 2) {
+ unsigned UnrollResMII = calculateResMII(i);
+ LLVM_DEBUG(dbgs() << "Unroll Factor = " << i << "(res=" << UnrollResMII
+ << ")\n");
+ // Compare cost per original iteration, not the raw ResMII values.
+ float UnrollResMIIRatio = (float)UnrollResMII / i;
+ float MinResMIIRatio = (float)MinResMII / MinUnrollFactor;
+
+ if (UnrollResMIIRatio < MinResMIIRatio) {
+ AnyBenefit = true;
+ if ((LoopHeaderSize * i) <= MachineUnrollerThres) {
+ AllBeneficialExceededThreshold = false;
+ MinResMII = UnrollResMII;
+ MinUnrollFactor = i;
+ } else {
+ LLVM_DEBUG(dbgs() << "Loop size " << (LoopHeaderSize * i)
+ << " exceeds threshold " << MachineUnrollerThres
+ << " for factor " << i << "\n");
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkMissed(
+ DEBUG_TYPE, "SizeLimit",
+ Loop.getHeader()->front().getDebugLoc(), Loop.getHeader())
+ << "Unable to unroll loop by factor " << NV("Factor", i)
+ << ": unrolled size " << NV("Size", LoopHeaderSize * i)
+ << " exceeds threshold "
+ << NV("Threshold", (unsigned)MachineUnrollerThres);
+ });
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << "Unroll factor " << i
+ << " did not improve ResMII\n");
+ }
+ }
+
+ // NOTE(review): the "Unrolled" remark is emitted when a factor is chosen,
+ // before the caller performs the transformation (which can still decline).
+ if (MinUnrollFactor > 1) {
+ ORE->emit([&]() {
+ return MachineOptimizationRemark(DEBUG_TYPE, "Unrolled",
+ Loop.getHeader()->front().getDebugLoc(),
+ Loop.getHeader())
+ << "Unrolled loop by factor "
+ << NV("UnrollFactor", MinUnrollFactor) << " (ResMII improved from "
+ << NV("InitialResMII", InitialResMII) << " to "
+ << NV("FinalResMII", MinResMII) << ")";
+ });
+ } else if (AnyBenefit && AllBeneficialExceededThreshold) {
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkMissed(
+ DEBUG_TYPE, "AllExceededThreshold",
+ Loop.getHeader()->front().getDebugLoc(), Loop.getHeader())
+ << "Unable to unroll loop: all beneficial factors exceeded size "
+ "threshold";
+ });
+ } else if (!AnyBenefit) {
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkMissed(
+ DEBUG_TYPE, "NoBenefit",
+ Loop.getHeader()->front().getDebugLoc(), Loop.getHeader())
+ << "Unable to unroll loop: unrolling does not improve ResMII";
+ });
+ }
+
+ LLVM_DEBUG(dbgs() << "Using unroll factor of " << MinUnrollFactor << "\n");
+ return MinUnrollFactor;
+ }
+
+ /// Estimate the resource-constrained MII of the loop header as if its body
+ /// were replicated \p UnrollFactor times.
+ ///
+ /// Instructions are placed greedily into DFA packetizer states; the number of
+ /// states needed at the end is the result.
+ unsigned MachineUnrollerSchedDAG::calculateResMII(unsigned UnrollFactor) {
+ // Each entry is one DFA state; they are owned by this function and deleted
+ // before it returns.
+ SmallVector<DFAPacketizer *, 8> Resources;
+ MachineBasicBlock *MBB = Loop.getHeader();
+ Resources.push_back(TII->CreateTargetScheduleState(MF.getSubtarget()));
+
+ // Sort the instructions by the number of available choices for scheduling,
+ // least to most. Use the number of critical resources as the tie breaker.
+ FuncUnitSorter FUS =
+ FuncUnitSorter(MF.getSubtarget().getInstrItineraryData());
+ for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
+ E = MBB->getFirstTerminator();
+ I != E; ++I)
+ FUS.calcCriticalResources(*I);
+ PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
+ FuncUnitOrder(FUS);
+
+ // To compute ResMII for the unrolled loop, simply replicate instructions as
+ // many times as the unroll factor.
+ for (unsigned i = 0; i < UnrollFactor; i++) {
+ for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
+ E = MBB->getFirstTerminator();
+ I != E; ++I)
+ FuncUnitOrder.push(&*I);
+ }
+ while (!FuncUnitOrder.empty()) {
+ MachineInstr *MI = FuncUnitOrder.top();
+ FuncUnitOrder.pop();
+ if (TII->isZeroCost(MI->getOpcode()))
+ continue;
+ // Attempt to reserve the instruction in an existing DFA. At least one
+ // DFA is needed for each cycle.
+ // NumCycles is fixed at 1 here, so every instruction occupies one state.
+ unsigned NumCycles = 1;
+ unsigned ReservedCycles = 0;
+ SmallVectorImpl<DFAPacketizer *>::iterator RI = Resources.begin();
+ SmallVectorImpl<DFAPacketizer *>::iterator RE = Resources.end();
+ for (unsigned C = 0; C < NumCycles; ++C)
+ while (RI != RE) {
+ if ((*RI++)->canReserveResources(*MI)) {
+ ++ReservedCycles;
+ break;
+ }
+ }
+ // Start reserving resources using existing DFAs.
+ // RI is one past the state that accepted MI, hence the pre-decrement.
+ for (unsigned C = 0; C < ReservedCycles; ++C) {
+ --RI;
+ (*RI)->reserveResources(*MI);
+ }
+ // Add new DFAs, if needed, to reserve resources.
+ for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
+ DFAPacketizer *NewResource =
+ TII->CreateTargetScheduleState(MF.getSubtarget());
+ assert(NewResource->canReserveResources(*MI) && "Reserve error.");
+ NewResource->reserveResources(*MI);
+ Resources.push_back(NewResource);
+ }
+ }
+ // One DFA state per cycle: the number of states is the ResMII.
+ int Resmii = Resources.size();
+ // Delete the memory for each of the DFAs that were created earlier.
+ for (DFAPacketizer *RI : Resources) {
+ DFAPacketizer *D = RI;
+ delete D;
+ }
+ Resources.clear();
+ return Resmii;
+ }
+
+ /// Adjust starting ResMII if latency between any of the instructions in
+ /// the loop header happens to be higher than the previously computed value
+ /// which is passed as the input parameter. This is done to account for
+ /// the extra copies and therefore resources that are needed when the loop
+ /// is software pipelined later on. One thing to note here is that even if
+ /// the pipeliner is able to find a schedule with the original ResMII,
+ /// the high latencies between the instructions will always cause stalls.
+ /// Identifying such loops here and unrolling them can help the pipeliner
+ /// generate better schedule with fewer stalls.
+ ///
+ /// \param ResMII resource MII computed for the loop body.
+ /// \returns the larger of \p ResMII and the highest latency found on a data
+ ///          dependence edge between two distinct instructions of the body.
+ unsigned MachineUnrollerSchedDAG::adjustResMIIForExtraCopies(unsigned ResMII) {
+   unsigned MinResMII = ResMII;
+   MachineBasicBlock *MBB = Loop.getHeader();
+   for (auto &MI :
+        make_range(MBB->getFirstNonPHI(), MBB->getFirstTerminator())) {
+     if (MI.isDebugInstr())
+       continue;
+     // getSUnit() yields null for an instruction the graph builder created no
+     // node for; such an instruction has no dependence edges to inspect.
+     SUnit *SU = getSUnit(&MI);
+     if (!SU)
+       continue;
+     for (auto &Dep : SU->Succs) {
+       // Only true data dependences on other instructions are of interest.
+       if (Dep.getSUnit() == SU)
+         continue;
+       if (Dep.getKind() != SDep::Data)
+         continue;
+       unsigned Latency = Dep.getLatency();
+       if (Latency > MinResMII)
+         MinResMII = Latency;
+     }
+   }
+   return MinResMII;
+ }
+
+ /// Pass entry point: try to unroll every loop nest of \p mf.
+ ///
+ /// Nothing is done when the function is skipped, when the pass is disabled
+ /// with -enable-machine-unroller=false, when the function is optimized for
+ /// size and -enable-machine-unroller-opt-size is not set to true, or when the
+ /// target supplies no MachineUnroller.
+ ///
+ /// \returns true if any loop was changed.
+ bool MachineUnrollerPass::runOnMachineFunction(MachineFunction &mf) {
+   if (skipFunction(mf.getFunction()))
+     return false;
+
+   if (!EnableMIUnroller)
+     return false;
+
+   // Test the option's value, not whether it appeared on the command line:
+   // an explicit "=false" must keep unrolling off for optsize functions.
+   if (mf.getFunction().hasFnAttribute(Attribute::OptimizeForSize) &&
+       !EnableMIUnrollerOptSize)
+     return false;
+
+   MF = &mf;
+   MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+   TII = MF->getSubtarget().getInstrInfo();
+   MRI = &MF->getRegInfo();
+   PassConfig = &getAnalysis<TargetPassConfig>();
+   ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
+   Unroller = PassConfig->createMachineUnroller(this);
+   if (!Unroller)
+     return false;
+
+   bool Changed = false;
+   for (auto &L : *MLI)
+     Changed |= tryToUnrollLoop(*L);
+
+   // The unroller is owned by this invocation; do not leave a dangling
+   // pointer behind for the next function.
+   delete Unroller;
+   Unroller = nullptr;
+   return Changed;
+ }
diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt
index 85ec5c7c2d45e..a78803a6a903f 100644
--- a/llvm/lib/Target/Hexagon/CMakeLists.txt
+++ b/llvm/lib/Target/Hexagon/CMakeLists.txt
@@ -48,6 +48,7 @@ add_llvm_target(HexagonCodeGen
HexagonLoopAlign.cpp
HexagonLoopIdiomRecognition.cpp
HexagonMachineFunctionInfo.cpp
+ HexagonMachineUnroller.cpp
HexagonMachineScheduler.cpp
HexagonMask.cpp
HexagonMCInstLower.cpp
diff --git a/llvm/lib/Target/Hexagon/HexagonMachineUnroller.cpp b/llvm/lib/Target/Hexagon/HexagonMachineUnroller.cpp
new file mode 100644
index 0000000000000..adac5660b05dc
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonMachineUnroller.cpp
@@ -0,0 +1,148 @@
+//===- HexagonMachineUnroller.cpp - Hexagon machine unroller --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Hexagon-specific implementation of machine loop unrolling.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMachineUnroller.h"
+#include "HexagonInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineUnroller.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-machine-unroller"
+
+ /// Return true if \p MI is an A2_andir whose immediate operand (operand 2)
+ /// is 1, i.e. the value it produces is either 0 or 1.
+ static bool executesAtMostOnce(MachineInstr *MI) {
+   return MI->getOpcode() == Hexagon::A2_andir &&
+          MI->getOperand(2).getImm() == 1;
+ }
+
+ /// Return the trip count operand of the hardware loop that ends in
+ /// \p LoopBB.
+ ///
+ /// \returns 0 when no loop setup instruction is found; the immediate trip
+ ///          count for J2_loop0i / J2_loop1i; otherwise the register holding
+ ///          the run-time trip count (J2_loop0r).
+ ///
+ /// NOTE(review): immediates and register numbers share the same return
+ /// value, so the caller has to know which kind it is dealing with.
+ unsigned HexagonMachineUnroller::getLoopCount(MachineBasicBlock &LoopBB) const {
+   // We expect a hardware loop currently. This means that IndVar is set
+   // to null, and the compare is the ENDLOOP instruction.
+   MachineBasicBlock::iterator I = LoopBB.getFirstTerminator();
+   assert(I != LoopBB.end() && HII->isEndLoopN(I->getOpcode()) &&
+          "Expecting a hardware loop");
+   SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
+   MachineInstr *Loop = HII->findLoopInstr(
+       &LoopBB, I->getOpcode(), I->getOperand(0).getMBB(), VisitedBBs);
+   if (!Loop)
+     return 0;
+   // The loop trip count is a compile-time value.
+   if (Loop->getOpcode() == Hexagon::J2_loop0i ||
+       Loop->getOpcode() == Hexagon::J2_loop1i) {
+     LLVM_DEBUG(
+         dbgs() << "HexagonMachineUnroller: Found compile-time loop count: "
+                << Loop->getOperand(1).getImm() << "\n");
+     return Loop->getOperand(1).getImm();
+   }
+
+   // The loop trip count is a run-time value.
+   assert(Loop->getOpcode() == Hexagon::J2_loop0r && "Unexpected instruction");
+   LLVM_DEBUG(
+       dbgs() << "HexagonMachineUnroller: Found run-time loop count in reg "
+              << Loop->getOperand(1).getReg() << "\n");
+   return Loop->getOperand(1).getReg();
+ }
+
+ /// Append to \p MBB a logical right shift of \p LC by log2(\p UnrollFactor)
+ /// and return the new 32-bit register that receives the result.
+ /// \p UnrollFactor must be a power of two.
+ unsigned HexagonMachineUnroller::addUnrolledLoopCountMI(
+     MachineBasicBlock &MBB, unsigned LC, unsigned UnrollFactor) const {
+   assert(isPowerOf2_32(UnrollFactor) && "UnrollFactor must be a power of 2");
+   unsigned UnrolledLC = HII->createVR(MBB.getParent(), MVT::i32);
+   BuildMI(MBB, MBB.instr_end(), DebugLoc(), HII->get(Hexagon::S2_lsr_i_r),
+           UnrolledLC)
+       .addReg(LC)
+       .addImm(Log2_32(UnrollFactor));
+   return UnrolledLC;
+ }
+
+ /// Append to \p MBB an AND of \p LC with (\p UnrollFactor - 1) and return the
+ /// new 32-bit register that receives the result.
+ /// \p UnrollFactor must be a power of two.
+ unsigned
+ HexagonMachineUnroller::addRemLoopCountMI(MachineBasicBlock &MBB, unsigned LC,
+                                           unsigned UnrollFactor) const {
+   assert(isPowerOf2_32(UnrollFactor) && "UnrollFactor must be a power of 2");
+   unsigned Remainder = HII->createVR(MBB.getParent(), MVT::i32);
+   BuildMI(MBB, MBB.instr_end(), DebugLoc(), HII->get(Hexagon::A2_andir),
+           Remainder)
+       .addReg(LC)
+       .addImm(UnrollFactor - 1);
+   return Remainder;
+ }
+
+ /// Re-create the hardware loop setup for the loop headed by \p Header so that
+ /// it iterates \p LC times, and produce the guard condition for entering it.
+ ///
+ /// A "LC > 0" compare is appended to \p BB. If \p LC is defined by an
+ /// instruction accepted by executesAtMostOnce(), the ENDLOOP terminator of
+ /// \p LoopBB and the self-edge of \p Header are removed; otherwise a new
+ /// J2_loop0r on \p LC is appended to \p Preheader. The old loop setup
+ /// instruction is erased in both cases and \p Cond receives
+ /// {J2_jumpf, compare result}.
+ ///
+ /// Nothing is changed, and \p Cond is left as it was, when no loop setup
+ /// instruction is found or when it is not a J2_loop0r.
+ void HexagonMachineUnroller::changeLoopCount(
+     MachineBasicBlock &BB, MachineBasicBlock &Preheader,
+     MachineBasicBlock &Header, MachineBasicBlock &LoopBB, unsigned LC,
+     SmallVectorImpl<MachineOperand> &Cond) const {
+
+   // We expect a hardware loop currently. This means that IndVar is set
+   // to null, and the compare is the ENDLOOP instruction.
+   MachineBasicBlock::iterator EndLoop = LoopBB.getFirstTerminator();
+   assert(EndLoop != LoopBB.end() && HII->isEndLoopN(EndLoop->getOpcode()) &&
+          "Expecting a hardware loop");
+   DebugLoc DL = EndLoop->getDebugLoc();
+   SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
+   MachineInstr *Loop =
+       HII->findLoopInstr(&Header, EndLoop->getOpcode(),
+                          EndLoop->getOperand(0).getMBB(), VisitedBBs);
+   if (!Loop) {
+     LLVM_DEBUG(
+         dbgs() << "HexagonMachineUnroller: Loop instruction not found\n");
+     return;
+   }
+   // The loop trip count is a run-time value.
+   if (Loop->getOpcode() != Hexagon::J2_loop0r) {
+     LLVM_DEBUG(dbgs() << "HexagonMachineUnroller: Unexpected loop opcode: "
+                       << Loop->getOpcode() << "\n");
+     return;
+   }
+
+   MachineRegisterInfo &MRI = EndLoop->getParent()->getParent()->getRegInfo();
+   const bool RunsAtMostOnce = executesAtMostOnce(MRI.getVRegDef(LC));
+
+   // Both variants guard the loop with "LC > 0".
+   unsigned LoopEnd = HII->createVR(MF, MVT::i1);
+   MachineInstr *NewCmp =
+       BuildMI(&BB, DL, HII->get(Hexagon::C2_cmpgtui), LoopEnd)
+           .addReg(LC)
+           .addImm(0);
+
+   if (RunsAtMostOnce) {
+     // The loop executes at most once. Therefore, it must be unrolled
+     // by removing loop setup, endloop and back-edge (jump) instruction to
+     // avoid stalls due to front-end mispredictions.
+     // FYI: the front end predicts endloop is taken twice and then waits to
+     // see which way it goes when it encounters it a third time. Since
+     // loop[01] is resolved by the back-end and it takes at least 10 cycles
+     // from fetch to commit, for the very small loops that execute only once,
+     // it can result into a lot of stalled cycles.
+     EndLoop->eraseFromParent();
+     Header.removeSuccessor(&Header);
+   } else {
+     BuildMI(&Preheader, DL, HII->get(Hexagon::J2_loop0r))
+         .addMBB(Loop->getOperand(0).getMBB())
+         .addReg(LC);
+   }
+
+   // Delete the old loop instruction.
+   Loop->eraseFromParent();
+   Cond.push_back(MachineOperand::CreateImm(Hexagon::J2_jumpf));
+   Cond.push_back(NewCmp->getOperand(0));
+   LLVM_DEBUG(dbgs() << "HexagonMachineUnroller: Updated loop count mechanism "
+                        "for unrolled loop.\n");
+ }
diff --git a/llvm/lib/Target/Hexagon/HexagonMachineUnroller.h b/llvm/lib/Target/Hexagon/HexagonMachineUnroller.h
new file mode 100644
index 0000000000000..ae603b840e5a7
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonMachineUnroller.h
@@ -0,0 +1,46 @@
+//===- HexagonMachineUnroller.h - Hexagon machine unroller ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Hexagon-specific implementation of machine loop unrolling.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINEUNROLLER_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINEUNROLLER_H
+
+#include "HexagonInstrInfo.h"
+#include "llvm/CodeGen/MachineUnroller.h"
+
+namespace llvm {
+
+class HexagonMachineUnroller : public MachineUnroller {
+ const HexagonInstrInfo *HII;
+
+public:
+ explicit HexagonMachineUnroller(MachineUnrollerContext *C)
+ : MachineUnroller(C),
+ HII(static_cast<const HexagonInstrInfo *>(C->TII)) {}
+
+ unsigned getLoopCount(MachineBasicBlock &MBB) const override;
+
+ /// Add an instruction to compute the trip count for the unrolled loop.
+ unsigned addUnrolledLoopCountMI(MachineBasicBlock &MBB, unsigned LC,
+ unsigned UnrollFactor) const override;
+
+ /// Add an instruction to compute the remainder trip count for the loop.
+ unsigned addRemLoopCountMI(MachineBasicBlock &MBB, unsigned LC,
+ unsigned UnrollFactor) const override;
+
+ void changeLoopCount(MachineBasicBlock &BB, MachineBasicBlock &Preheader,
+ MachineBasicBlock &Header, MachineBasicBlock &LoopBB,
+ unsigned LC,
+ SmallVectorImpl<MachineOperand> &Cond) const override;
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINEUNROLLER_H
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 2f14622cab57c..e092f60b101ff 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -16,6 +16,7 @@
#include "HexagonLoopIdiomRecognition.h"
#include "HexagonMachineFunctionInfo.h"
#include "HexagonMachineScheduler.h"
+#include "HexagonMachineUnroller.h"
#include "HexagonTargetObjectFile.h"
#include "HexagonTargetTransformInfo.h"
#include "HexagonVectorLoopCarriedReuse.h"
@@ -312,6 +313,11 @@ class HexagonPassConfig : public TargetPassConfig {
return getTM<HexagonTargetMachine>();
}
+ MachineUnroller *
+ createMachineUnroller(MachineUnrollerContext *C) const override {
+ return new HexagonMachineUnroller(C);
+ }
+
void addIRPasses() override;
bool addInstSelector() override;
bool addILPOpts() override;
@@ -425,6 +431,8 @@ void HexagonPassConfig::addPreRegAlloc() {
if (!DisableHardwareLoops)
addPass(createHexagonHardwareLoops());
}
+ if (getOptLevel() == CodeGenOptLevel::Aggressive)
+ addPass(&MachineUnrollerPassID);
if (TM->getOptLevel() >= CodeGenOptLevel::Default)
addPass(&MachinePipelinerID);
}
diff --git a/llvm/test/CodeGen/Hexagon/dbg-instr-machunroll.ll b/llvm/test/CodeGen/Hexagon/dbg-instr-machunroll.ll
new file mode 100644
index 0000000000000..635a17558a1b4
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/dbg-instr-machunroll.ll
@@ -0,0 +1,141 @@
+; RUN: llc -march=hexagon -O3 -o /dev/null < %s 2>&1
+; Test that the compiler doesn't seg fault due to DBG_LABEL or DBG_VALUE
+; in Machine Unroller pass.
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #0
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #0
+
+; Function Attrs: nounwind
+define dso_local void @fw_time_wait_us(i32 %us) local_unnamed_addr #2 !dbg !13 {
+entry:
+ %dummy = alloca [256 x i8], align 8
+ call void @llvm.dbg.value(metadata i32 %us, metadata !18, metadata !DIExpression()), !dbg !27
+ call void @llvm.dbg.value(metadata i32 %us, metadata !19, metadata !DIExpression(DW_OP_constu, 1000, DW_OP_mul, DW_OP_stack_value)), !dbg !27
+ call void @llvm.dbg.value(metadata i32 0, metadata !21, metadata !DIExpression()), !dbg !27
+ %0 = getelementptr inbounds [256 x i8], [256 x i8]* %dummy, i32 0, i32 0, !dbg !28
+ call void @llvm.lifetime.start.p0i8(i64 256, i8* nonnull %0) #3, !dbg !28
+ call void @llvm.dbg.declare(metadata [256 x i8]* %dummy, metadata !22, metadata !DIExpression()), !dbg !29
+ %1 = tail call i32 asm sideeffect "$0 = pcyclelo", "=r"() #3, !dbg !30, !srcloc !31
+ call void @llvm.dbg.value(metadata i32 %1, metadata !21, metadata !DIExpression()), !dbg !27
+ call void @llvm.dbg.value(metadata i32 %us, metadata !19, metadata !DIExpression(DW_OP_constu, 1000, DW_OP_mul, DW_OP_stack_value)), !dbg !27
+ %cmp10 = icmp sgt i32 %us, 0, !dbg !32
+ br i1 %cmp10, label %do.body.preheader.preheader, label %while.end, !dbg !34
+
+do.body.preheader.preheader: ; preds = %entry
+ %mul = mul nsw i32 %us, 1000, !dbg !35
+ call void @llvm.dbg.value(metadata i32 %mul, metadata !19, metadata !DIExpression()), !dbg !27
+ call void @llvm.dbg.value(metadata i32 %mul, metadata !19, metadata !DIExpression()), !dbg !27
+ br label %do.body.preheader, !dbg !36
+
+do.body.preheader: ; preds = %do.end, %do.body.preheader.preheader
+ %count.011 = phi i32 [ %dec, %do.end ], [ %mul, %do.body.preheader.preheader ]
+ call void @llvm.dbg.value(metadata i32 %count.011, metadata !19, metadata !DIExpression()), !dbg !27
+ br label %do.body, !dbg !37
+
+do.body: ; preds = %do.body, %do.body.preheader
+ %2 = tail call i32 asm sideeffect "$0 = pcyclelo", "=r"() #3, !dbg !39, !srcloc !41
+ call void @llvm.dbg.value(metadata i32 %2, metadata !20, metadata !DIExpression()), !dbg !27
+ %and = and i32 %2, 256, !dbg !42
+ %arrayidx = getelementptr inbounds [256 x i8], [256 x i8]* %dummy, i32 0, i32 %and, !dbg !43
+ %3 = load i8, i8* %arrayidx, align 8, !dbg !44, !tbaa !45
+ %inc = add i8 %3, 1, !dbg !44
+ store i8 %inc, i8* %arrayidx, align 8, !dbg !44, !tbaa !45
+ %div = lshr i32 %2, 8, !dbg !48
+ %rem = and i32 %div, 255, !dbg !49
+ %arrayidx1 = getelementptr inbounds [256 x i8], [256 x i8]* %dummy, i32 0, i32 %rem, !dbg !50
+ %4 = load i8, i8* %arrayidx1, align 1, !dbg !51, !tbaa !45
+ %inc2 = add i8 %4, 1, !dbg !51
+ store i8 %inc2, i8* %arrayidx1, align 1, !dbg !51, !tbaa !45
+ %cmp3 = icmp ugt i32 %2, %1, !dbg !52
+ br i1 %cmp3, label %do.end, label %do.body, !dbg !53, !llvm.loop !55
+
+do.end: ; preds = %do.body
+ %dec = add nsw i32 %count.011, -1, !dbg !57
+ call void @llvm.dbg.value(metadata i32 %dec, metadata !19, metadata !DIExpression()), !dbg !27
+ %cmp = icmp sgt i32 %dec, 0, !dbg !32
+ br i1 %cmp, label %do.body.preheader, label %while.end, !dbg !34, !llvm.loop !58
+
+while.end: ; preds = %do.end, %entry
+ call void @llvm.lifetime.end.p0i8(i64 256, i8* nonnull %0) #3, !dbg !60
+ ret void, !dbg !60
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { argmemonly nounwind willreturn }
+attributes #1 = { nounwind readnone speculatable willreturn }
+attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv71" "target-features"="+v71,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10, !11}
+!llvm.ident = !{!12}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "QuIC LLVM Hexagon Clang version 8.4.alpha4 Engineering Release: hexagon-clang-mono-84-2032 (based on LLVM 10.0.0)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, globals: !2, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "/prj/dsp/qdsp6/austin/builds/hexbuild/test_trees/MASTER/test/regress/simulator/api_tests/NMI/test.c", directory: "/local/mnt/workspace")
+!2 = !{}
+!3 = !{!4, !6, !8}
+!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 32)
+!5 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 32)
+!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 32)
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{i32 1, !"wchar_size", i32 4}
+!12 = !{!"QuIC LLVM Hexagon Clang version 8.4.alpha4 Engineering Release: hexagon-clang-mono-84-2032 (based on LLVM 10.0.0)"}
+!13 = distinct !DISubprogram(name: "fw_time_wait_us", scope: !14, file: !14, line: 71, type: !15, scopeLine: 72, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17)
+!14 = !DIFile(filename: "/prj/dsp/qdsp6/austin/builds/hexbuild/test_trees/MASTER/test/regress/simulator/api_tests/NMI/test.c", directory: "")
+!15 = !DISubroutineType(types: !16)
+!16 = !{null, !7}
+!17 = !{!18, !19, !20, !21, !22}
+!18 = !DILocalVariable(name: "us", arg: 1, scope: !13, file: !14, line: 71, type: !7)
+!19 = !DILocalVariable(name: "count", scope: !13, file: !14, line: 73, type: !7)
+!20 = !DILocalVariable(name: "xo", scope: !13, file: !14, line: 74, type: !5)
+!21 = !DILocalVariable(name: "base", scope: !13, file: !14, line: 74, type: !5)
+!22 = !DILocalVariable(name: "dummy", scope: !13, file: !14, line: 75, type: !23)
+!23 = !DICompositeType(tag: DW_TAG_array_type, baseType: !24, size: 2048, elements: !25)
+!24 = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char)
+!25 = !{!26}
+!26 = !DISubrange(count: 256)
+!27 = !DILocation(line: 0, scope: !13)
+!28 = !DILocation(line: 75, column: 3, scope: !13)
+!29 = !DILocation(line: 75, column: 17, scope: !13)
+!30 = !DILocation(line: 77, column: 3, scope: !13)
+!31 = !{i32 1828}
+!32 = !DILocation(line: 78, column: 16, scope: !33)
+!33 = !DILexicalBlockFile(scope: !13, file: !14, discriminator: 2)
+!34 = !DILocation(line: 78, column: 3, scope: !33)
+!35 = !DILocation(line: 73, column: 20, scope: !13)
+!36 = !DILocation(line: 78, column: 3, scope: !13)
+!37 = !DILocation(line: 80, column: 5, scope: !38)
+!38 = distinct !DILexicalBlock(scope: !13, file: !14, line: 79, column: 3)
+!39 = !DILocation(line: 82, column: 7, scope: !40)
+!40 = distinct !DILexicalBlock(scope: !38, file: !14, line: 81, column: 5)
+!41 = !{i32 1916}
+!42 = !DILocation(line: 84, column: 17, scope: !40)
+!43 = !DILocation(line: 84, column: 7, scope: !40)
+!44 = !DILocation(line: 84, column: 24, scope: !40)
+!45 = !{!46, !46, i64 0}
+!46 = !{!"omnipotent char", !47, i64 0}
+!47 = !{!"Simple C/C++ TBAA"}
+!48 = !DILocation(line: 85, column: 17, scope: !40)
+!49 = !DILocation(line: 85, column: 23, scope: !40)
+!50 = !DILocation(line: 85, column: 7, scope: !40)
+!51 = !DILocation(line: 85, column: 29, scope: !40)
+!52 = !DILocation(line: 87, column: 15, scope: !38)
+!53 = !DILocation(line: 86, column: 5, scope: !54)
+!54 = !DILexicalBlockFile(scope: !40, file: !14, discriminator: 2)
+!55 = distinct !{!55, !37, !56}
+!56 = !DILocation(line: 87, column: 22, scope: !38)
+!57 = !DILocation(line: 88, column: 10, scope: !38)
+!58 = distinct !{!58, !36, !59}
+!59 = !DILocation(line: 89, column: 3, scope: !13)
+!60 = !DILocation(line: 90, column: 1, scope: !13)
diff --git a/llvm/test/CodeGen/Hexagon/dbg-label-machunroll.ll b/llvm/test/CodeGen/Hexagon/dbg-label-machunroll.ll
new file mode 100644
index 0000000000000..213c2ec1a37a8
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/dbg-label-machunroll.ll
@@ -0,0 +1,61 @@
+; RUN: llc -O3 -march=hexagon < %s | FileCheck %s
+; CHECK-NOT: Segmentation
+
+target triple = "hexagon-unknown-unknown-elf"
+
+declare void @llvm.dbg.label(metadata)
+
+define hidden fastcc i8 @__pyx_f_4lxml_5etree__decodeFilenameWithLength(i8* %__pyx_v_c_path, i32 %__pyx_v_c_len) unnamed_addr {
+entry:
+ %add.ptr.i = getelementptr i8, i8* %__pyx_v_c_path, i32 1
+ %0 = load i8, i8* %__pyx_v_c_path, align 1
+ br label %while.cond.preheader.i
+
+while.cond.preheader.i: ; preds = %entry
+ %1 = load i8, i8* %add.ptr.i, align 1
+ %2 = and i8 %1, -33
+ %3 = add i8 %2, -65
+ %4 = icmp ult i8 %3, 26
+ br i1 %4, label %if.end101.i, label %if.end132.i
+
+if.end101.i: ; preds = %if.end101.i, %while.cond.preheader.i
+ %__pyx_v_c_path.addr.0223.i = phi i8* [ %add.ptr102.i, %if.end101.i ], [ %add.ptr.i, %while.cond.preheader.i ]
+ %add.ptr102.i = getelementptr i8, i8* %__pyx_v_c_path.addr.0223.i, i32 1
+ %.pr.i = load i8, i8* %add.ptr102.i, align 1
+ %5 = and i8 %.pr.i, -33
+ %6 = add i8 %5, -65
+ %7 = icmp ult i8 %6, 26
+ call void @llvm.dbg.label(metadata !23), !dbg !27
+ br i1 %7, label %if.end101.i, label %if.end132.i
+
+if.end132.i: ; preds = %if.end101.i, %while.cond.preheader.i
+ ret i8 %0
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "QuIC LLVM Hexagon Clang version hexagon-clang-84-1613 (based on LLVM 9.0.0)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "lxml.etree.c", directory: "/local/mnt/workspace/santdas/src/llvm/master/qtool/qtool-42625")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 7, !"PIC Level", i32 2}
+!9 = !DILabel(scope: !10, name: "__pyx_L5_bool_binop_done", file: !11, line: 32696)
+!10 = distinct !DISubprogram(name: "__pyx_f_4lxml_5etree__isFilePath", scope: !11, file: !11, line: 32628, type: !12, scopeLine: 32628, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!11 = !DIFile(filename: "src/lxml/lxml.etree.c", directory: "/local/mnt/workspace/santdas/src/llvm/master/qtool/qtool-42625")
+!12 = !DISubroutineType(types: !13)
+!13 = !{!14, !15}
+!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 32)
+!16 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !17)
+!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "xmlChar", file: !18, line: 28, baseType: !19)
+!18 = !DIFile(filename: "scratch/buildroot/buildroot/output/host/usr/hexagon-buildroot-linux-musl/sysroot/usr/include/libxml2/libxml/xmlstring.h", directory: "/local/mnt/workspace")
+!19 = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char)
+!21 = distinct !DILocation(line: 33263, column: 16, scope: !22)
+!22 = distinct !DISubprogram(name: "__pyx_f_4lxml_5etree__decodeFilenameWithLength", scope: !11, file: !11, line: 33236, type: !12, scopeLine: 33236, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!23 = !DILabel(scope: !24, name: "__pyx_L12_bool_binop_done", file: !11, line: 32782)
+!24 = distinct !DILexicalBlock(scope: !25, file: !11, line: 32765, column: 15)
+!25 = distinct !DILexicalBlock(scope: !26, file: !11, line: 32697, column: 18)
+!26 = distinct !DILexicalBlock(scope: !10, file: !11, line: 32697, column: 7)
+!27 = !DILocation(line: 32782, column: 7, scope: !24, inlinedAt: !21)
\ No newline at end of file
diff --git a/llvm/test/CodeGen/Hexagon/machine-unroller-remarks.ll b/llvm/test/CodeGen/Hexagon/machine-unroller-remarks.ll
new file mode 100644
index 0000000000000..9bf02ea82e999
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/machine-unroller-remarks.ll
@@ -0,0 +1,48 @@
+; RUN: llc -march=hexagon -O3 -enable-machine-unroller \
+; RUN: -pass-remarks=machine-unroller -pass-remarks-missed=machine-unroller \
+; RUN: < %s 2>&1 | FileCheck %s
+; CHECK: remark: {{.*}}: Unrolled loop by factor 2 (ResMII improved from {{[0-9]+}} to {{[0-9]+}})
+
+; RUN: llc -march=hexagon -O3 -enable-machine-unroller \
+; RUN: -machine-unroller-threshold=1 -pass-remarks=machine-unroller \
+; RUN: -pass-remarks-missed=machine-unroller < %s 2>&1 |\
+; RUN: FileCheck %s --check-prefix=MISSED
+; MISSED: remark: {{.*}}: Unable to unroll loop by factor 2: unrolled size {{[0-9]+}} exceeds threshold 1
+; MISSED: remark: {{.*}}: Unable to unroll loop by factor 4: unrolled size {{[0-9]+}} exceeds threshold 1
+
+define float @test(i32 %n, float %da, float* noalias nocapture readonly %dx, i32 %incx, float* noalias nocapture %dy, i32 %incy) local_unnamed_addr {
+entry:
+ %cmp = icmp slt i32 %n, 1
+ %cmp1 = fcmp oeq float %da, 0.000000e+00
+ %or.cond45 = or i1 %cmp, %cmp1
+ br i1 %or.cond45, label %if.then6, label %if.end3
+
+if.end3:
+ %cmp4 = icmp ne i32 %incx, 1
+ %cmp5 = icmp ne i32 %incy, 1
+ %or.cond = or i1 %cmp4, %cmp5
+ br i1 %or.cond, label %if.then6, label %for.body.lr.ph
+
+if.then6:
+ ret float 0.000000e+00
+
+for.body.lr.ph:
+ %0 = load float, float* %dy, align 4
+ br label %for.body
+
+for.body:
+ %arrayidx18.phi = phi float* [ %dx, %for.body.lr.ph ], [ %arrayidx18.inc, %for.body ]
+ %arrayidx21.phi = phi float* [ %dy, %for.body.lr.ph ], [ %arrayidx21.inc, %for.body ]
+ %i.047 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %1 = load float, float* %arrayidx18.phi, align 4
+ %mul19 = fmul float %1, %da
+ %add20 = fadd float %0, %mul19
+ store float %add20, float* %arrayidx21.phi, align 4
+ %inc = add nuw nsw i32 %i.047, 1
+ %exitcond = icmp eq i32 %inc, %n
+ %arrayidx18.inc = getelementptr float, float* %arrayidx18.phi, i32 32
+ %arrayidx21.inc = getelementptr float, float* %arrayidx21.phi, i32 32
+ br i1 %exitcond, label %if.then6, label %for.body
+}
+
+
diff --git a/llvm/test/CodeGen/Hexagon/miunroll-adjust-resmii.ll b/llvm/test/CodeGen/Hexagon/miunroll-adjust-resmii.ll
new file mode 100644
index 0000000000000..26a6d79625156
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/miunroll-adjust-resmii.ll
@@ -0,0 +1,48 @@
+; RUN: llc -O3 -march=hexagon -enable-machine-unroller=false < %s | FileCheck --check-prefix=CHECK-NO-UNROLL %s
+; RUN: llc -O3 -march=hexagon -enable-machine-unroller=true < %s | FileCheck --check-prefix=CHECK-UNROLL %s
+
+; Without the machine unroller, make sure that the innermost loop has only one sfmpy instruction.
+; CHECK-NO-UNROLL: loop0(.LBB0_[[LOOP:.]]
+; CHECK-NO-UNROLL: .LBB0_[[LOOP]]:
+; CHECK-NO-UNROLL: sfmpy
+; CHECK-NO-UNROLL-NOT: sfmpy
+; CHECK-NO-UNROLL: endloop0
+; CHECK-NO-UNROLL-NOT: loop0
+
+; When the machine unroller is enabled, the innermost loop in the test
+; gets unrolled by 2. Make sure that there are 2 sfmpy instructions
+; (one for each loop iteration) in the unrolled loop.
+
+; CHECK-UNROLL: loop0(.LBB0_[[LOOP:.]]
+; CHECK-UNROLL: .LBB0_[[LOOP]]:
+; CHECK-UNROLL: sfmpy
+; CHECK-UNROLL: sfmpy
+; CHECK-UNROLL-NOT: sfmpy
+; CHECK-UNROLL: } :endloop0
+
+; Function Attrs: noinline nounwind
+define dso_local void @test(ptr noalias nocapture readonly %in, ptr noalias nocapture %out, float %scale, i32 %n_samples) local_unnamed_addr {
+entry:
+ %cmp6 = icmp eq i32 %n_samples, 0
+ br i1 %cmp6, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %arrayidx.phi = phi ptr [ %arrayidx.inc, %for.body ], [ %in, %entry ]
+ %arrayidx1.phi = phi ptr [ %arrayidx1.inc, %for.body ], [ %out, %entry ]
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %0 = load i32, ptr %arrayidx.phi, align 4
+ %1 = tail call float @llvm.hexagon.F2.conv.w2sf(i32 %0)
+ %mul = fmul contract float %1, %scale
+ store float %mul, ptr %arrayidx1.phi, align 4
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %n_samples
+ %arrayidx.inc = getelementptr i32, ptr %arrayidx.phi, i32 1
+ %arrayidx1.inc = getelementptr float, ptr %arrayidx1.phi, i32 1
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.hexagon.F2.conv.w2sf(i32)
diff --git a/llvm/test/CodeGen/Hexagon/miunroll-memoperand-size.mir b/llvm/test/CodeGen/Hexagon/miunroll-memoperand-size.mir
new file mode 100644
index 0000000000000..9f5bc0b1c70c5
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/miunroll-memoperand-size.mir
@@ -0,0 +1,37 @@
+# RUN: llc -mtriple=hexagon-unknown-elf -run-pass=machine-unroller %s -o - | FileCheck %s
+
+# Test that the machine unroller updates the memoperand with 'unknown size'
+# instead of using MAX_INT, which doesn't work anymore.
+
+# CHECK: S2_storeri_io %1, 0, %{{[0-9]+}} :: (store (s32)
+# CHECK: S2_storeri_io %1, 0, %{{[0-9]+}} :: (store unknown-size
+
+...
+---
+name: test
+alignment: 4
+tracksRegLiveness: true
+body: |
+ bb.0:
+ successors: %bb.2(0x80000000)
+
+ %2:intregs = A2_tfrsi 0
+ %3:intregs = IMPLICIT_DEF
+ %5:intregs = IMPLICIT_DEF
+ %6:intregs = COPY %5
+ J2_loop0r %bb.2, %6, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+ J2_jump %bb.2, implicit-def dead $pc
+
+ bb.1:
+ successors:
+
+ bb.2:
+ successors: %bb.2(0x7fffffff), %bb.1(0x00000001)
+
+ %0:intregs = PHI %2, %bb.0, %1, %bb.2
+ %1:intregs = A2_addi %0, 1
+ S2_storeri_io %3, 0, %1 :: (store 4 into `i32* undef`)
+ ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+ J2_jump %bb.1, implicit-def dead $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/miunroll-optimize-memrefs1.ll b/llvm/test/CodeGen/Hexagon/miunroll-optimize-memrefs1.ll
new file mode 100644
index 0000000000000..22c8d32e60b64
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/miunroll-optimize-memrefs1.ll
@@ -0,0 +1,123 @@
+; RUN: llc -O3 -march=hexagon -enable-machine-unroller=false < %s | FileCheck --check-prefix=CHECK-NO-UNROLL %s
+; RUN: llc -O3 -march=hexagon -enable-machine-unroller=true -enable-timing-class-latency=false -enable-bsb-sched=true < %s | FileCheck --check-prefix=CHECK-UNROLL %s
+
+; correctness test for machine unroller
+
+; bsb generates 1.5 packets per loop iteration and better hides load latency with 2x unrolling
+; LBB0_8:
+; {
+; r12 = sfadd(r12,r9)
+; r9 = sfmpy(r6,r7)
+; r13 = memw(r3+#0)
+; r14 = memw(r5+#0)
+; }
+; {
+; r3 = add(r3,#256)
+; r5 = add(r5,#256)
+; r6 = memw(r3+#128)
+; r7 = memw(r5+#128)
+; }
+; {
+; r12 = sfadd(r12,r8)
+; r8 = sfmpy(r13,r14)
+;
+; } :endloop0
+; b2b generates 2 packets per loop iteration and does not unroll
+;.LBB0_8:
+; {
+; r0 = sfadd(r0,r5)
+; r6 = sfmpy(r3,r4)
+; r5 = r6
+; r3 = memw(r7+#0)
+; }
+; {
+; r7 = add(r7,#128)
+; r8 = add(r8,#128)
+; r4 = memw(r8+#0)
+; } :endloop0
+; create b2b bug
+
+
+; Without the machine unroller, make sure that the innermost loop has only one sfmpy instruction.
+
+; CHECK-NO-UNROLL: loop0(.LBB0_[[LOOP:.*]],
+; CHECK-NO-UNROLL: if ({{.*}}p{{[0-3]}}) jump{{.*}} .LBB0_{{.*}}
+; CHECK-NO-UNROLL: .LBB0_[[LOOP]]:
+; CHECK-NO-UNROLL-DAG: {
+; CHECK-NO-UNROLL-DAG: sfmpy
+; CHECK-NO-UNROLL-NOT: sfmpy
+; CHECK-NO-UNROLL: endloop0
+; CHECK-NO-UNROLL-NOT: loop0
+
+; When the machine unroller is enabled, the innermost loop in the test
+; gets unrolled by 2. Make sure that there are only 3 packets and
+; 2 sfmpy instructions (one for each loop iteration) in the unrolled loop.
+
+; CHECK-UNROLL: loop0(.LBB0_[[LOOP:.]]
+; CHECK-UNROLL: .LBB0_[[LOOP]]:
+; CHECK-UNROLL: sfmpy
+; CHECK-UNROLL: sfmpy
+; CHECK-UNROLL-NOT: sfmpy
+; CHECK-UNROLL: } :endloop0
+
+%struct.loops_params_s = type { i32, i32, i32, i32, i32, i32, i32, [32 x i32], [32 x i32], i32, i32, i32, i32, i32, ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, float, ptr, ptr, i32 }
+
+define float @inner_product(ptr %p) {
+entry:
+ %v = getelementptr inbounds %struct.loops_params_s, ptr %p, i32 0, i32 17
+ %0 = load ptr, ptr %v, align 4
+ %1 = load ptr, ptr %0, align 4
+ %arrayidx2 = getelementptr inbounds ptr, ptr %0, i32 1
+ %2 = load ptr, ptr %arrayidx2, align 4
+ %N = getelementptr inbounds %struct.loops_params_s, ptr %p, i32 0, i32 5
+ %3 = load i32, ptr %N, align 4
+ %Loop = getelementptr inbounds %struct.loops_params_s, ptr %p, i32 0, i32 9
+ %4 = load i32, ptr %Loop, align 4
+ %vsize = getelementptr inbounds %struct.loops_params_s, ptr %p, i32 0, i32 1
+ %5 = load i32, ptr %vsize, align 4
+ %call = tail call i32 @reinit_vec(ptr %p, ptr %1, i32 %5)
+ %6 = load i32, ptr %vsize, align 4
+ %call4 = tail call i32 @reinit_vec(ptr %p, ptr %2, i32 %6)
+ %cmp39 = icmp slt i32 %4, 1
+ br i1 %cmp39, label %for.end13, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %cmp636 = icmp sgt i32 %3, 0
+ br label %for.body
+
+for.body: ; preds = %for.inc11, %for.body.lr.ph
+ %q.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %q.1.lcssa, %for.inc11 ]
+ %l.040 = phi i32 [ 1, %for.body.lr.ph ], [ %inc12, %for.inc11 ]
+ br i1 %cmp636, label %for.body7.lr.ph, label %for.inc11
+
+for.body7.lr.ph: ; preds = %for.body
+ %arrayidx8.gep = getelementptr float, ptr %2, i32 %l.040
+ br label %for.body7
+
+for.body7: ; preds = %for.body7, %for.body7.lr.ph
+ %q.138 = phi float [ %q.042, %for.body7.lr.ph ], [ %add10, %for.body7 ]
+ %arrayidx8.phi = phi ptr [ %arrayidx8.gep, %for.body7.lr.ph ], [ %arrayidx8.inc, %for.body7 ]
+ %arrayidx9.phi = phi ptr [ %1, %for.body7.lr.ph ], [ %arrayidx9.inc, %for.body7 ]
+ %k.037 = phi i32 [ 0, %for.body7.lr.ph ], [ %inc, %for.body7 ]
+ %7 = load float, ptr %arrayidx8.phi, align 4
+ %8 = load float, ptr %arrayidx9.phi, align 4
+ %mul = fmul float %7, %8
+ %add10 = fadd float %q.138, %mul
+ %inc = add nuw nsw i32 %k.037, 1
+ %exitcond = icmp eq i32 %inc, %3
+ %arrayidx8.inc = getelementptr float, ptr %arrayidx8.phi, i32 32
+ %arrayidx9.inc = getelementptr float, ptr %arrayidx9.phi, i32 32
+ br i1 %exitcond, label %for.inc11, label %for.body7
+
+for.inc11: ; preds = %for.body7, %for.body
+ %q.1.lcssa = phi float [ %q.042, %for.body ], [ %add10, %for.body7 ]
+ %inc12 = add nuw nsw i32 %l.040, 1
+ %exitcond44 = icmp eq i32 %l.040, %4
+ br i1 %exitcond44, label %for.end13, label %for.body
+
+for.end13: ; preds = %for.inc11, %entry
+ %q.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %q.1.lcssa, %for.inc11 ]
+ ret float %q.0.lcssa
+}
+
+declare i32 @reinit_vec(...) local_unnamed_addr
diff --git a/llvm/test/CodeGen/Hexagon/miunroll-selfdependency.ll b/llvm/test/CodeGen/Hexagon/miunroll-selfdependency.ll
new file mode 100644
index 0000000000000..4a8fe89c505b1
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/miunroll-selfdependency.ll
@@ -0,0 +1,25 @@
+; RUN: llc -O3 -march=hexagon -debug-only=machine-unroller < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+; The test checks that we don't unroll the loop if we detect there is a self
+; dependency between instructions across loop iterations that cannot be removed
+; and ResMII=1
+; CHECK: Self Dependencies Found. Using unroll factor = 1
+
+; Function Attrs: norecurse nounwind readnone
+define dso_local i32 @fac(i32 %n) local_unnamed_addr {
+entry:
+ %cmp5 = icmp sgt i32 %n, 0
+ br i1 %cmp5, label %while.body, label %while.end
+
+while.body: ; preds = %entry, %while.body
+ %f.07 = phi i32 [ %mul, %while.body ], [ 1, %entry ]
+ %n.addr.06 = phi i32 [ %dec, %while.body ], [ %n, %entry ]
+ %mul = mul nsw i32 %f.07, %n.addr.06
+ %dec = add nsw i32 %n.addr.06, -1
+ %cmp = icmp sgt i32 %dec, 0
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ %f.0.lcssa = phi i32 [ 1, %entry ], [ %mul, %while.body ]
+ ret i32 %f.0.lcssa
+}
diff --git a/llvm/test/CodeGen/Hexagon/miunroll-selfdependency2.ll b/llvm/test/CodeGen/Hexagon/miunroll-selfdependency2.ll
new file mode 100644
index 0000000000000..f30b78c0e9fb4
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/miunroll-selfdependency2.ll
@@ -0,0 +1,54 @@
+; RUN: llc -O3 -march=hexagon -debug-only=machine-unroller < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: Using unroll factor of 2
+
+%struct.loop_params_s = type { i32, ptr, i32 }
+
+define dso_local float @foo(ptr nocapture readonly %in) local_unnamed_addr {
+entry:
+ %0 = load i32, ptr %in, align 4
+ %n1 = getelementptr inbounds %struct.loop_params_s, ptr %in, i32 0, i32 2
+ %1 = load i32, ptr %n1, align 4
+ %v = getelementptr inbounds %struct.loop_params_s, ptr %in, i32 0, i32 1
+ %2 = load ptr, ptr %v, align 4
+ %3 = load ptr, ptr %2, align 4
+ %cmp31 = icmp sgt i32 %0, 0
+ %cmp528 = icmp sgt i32 %1, 0
+ %or.cond = and i1 %cmp31, %cmp528
+ br i1 %or.cond, label %for.cond4.preheader.us.preheader, label %for.end12
+
+for.cond4.preheader.us.preheader: ; preds = %entry
+ %arrayidx3 = getelementptr inbounds ptr, ptr %2, i32 1
+ %4 = load ptr, ptr %arrayidx3, align 4
+ br label %for.cond4.preheader.us
+
+for.cond4.preheader.us: ; preds = %for.cond4.for.inc10_crit_edge.us, %for.cond4.preheader.us.preheader
+ %q.033.us = phi float [ %add9.us, %for.cond4.for.inc10_crit_edge.us ], [ 0.000000e+00, %for.cond4.preheader.us.preheader ]
+ %i.032.us = phi i32 [ %inc11.us, %for.cond4.for.inc10_crit_edge.us ], [ 0, %for.cond4.preheader.us.preheader ]
+ br label %for.body6.us
+
+for.body6.us: ; preds = %for.body6.us, %for.cond4.preheader.us
+ %q.130.us = phi float [ %q.033.us, %for.cond4.preheader.us ], [ %add9.us, %for.body6.us ]
+ %.pn = phi ptr [ %4, %for.cond4.preheader.us ], [ %arrayidx7.us.phi, %for.body6.us ]
+ %arrayidx8.us.phi = phi ptr [ %3, %for.cond4.preheader.us ], [ %arrayidx8.us.inc, %for.body6.us ]
+ %k.029.us = phi i32 [ 0, %for.cond4.preheader.us ], [ %add.us, %for.body6.us ]
+ %arrayidx7.us.phi = getelementptr float, ptr %.pn, i32 1
+ %add.us = add nuw nsw i32 %k.029.us, 1
+ %5 = load float, ptr %arrayidx7.us.phi, align 4
+ %6 = load float, ptr %arrayidx8.us.phi, align 4
+ %mul.us = fmul float %5, %6
+ %add9.us = fadd float %q.130.us, %mul.us
+ %exitcond = icmp eq i32 %add.us, %1
+ %arrayidx8.us.inc = getelementptr float, ptr %arrayidx8.us.phi, i32 1
+ br i1 %exitcond, label %for.cond4.for.inc10_crit_edge.us, label %for.body6.us
+
+for.cond4.for.inc10_crit_edge.us: ; preds = %for.body6.us
+ %inc11.us = add nuw nsw i32 %i.032.us, 1
+ %exitcond36 = icmp eq i32 %inc11.us, %0
+ br i1 %exitcond36, label %for.end12, label %for.cond4.preheader.us
+
+for.end12: ; preds = %for.cond4.for.inc10_crit_edge.us, %entry
+ %q.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add9.us, %for.cond4.for.inc10_crit_edge.us ]
+ ret float %q.0.lcssa
+}
diff --git a/llvm/test/CodeGen/Hexagon/miunroll-update-memoperands.ll b/llvm/test/CodeGen/Hexagon/miunroll-update-memoperands.ll
new file mode 100644
index 0000000000000..24dd384f34fcb
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/miunroll-update-memoperands.ll
@@ -0,0 +1,56 @@
+; RUN: llc -march=hexagon -O3 -enable-machine-unroller=true < %s
+; REQUIRES: asserts
+
+; This test used to fail with an "UNREACHABLE" executed in Machine Unroller due to a bug
+; in computeDelta function.
+
+%class.mrObjectRecord = type { i32, i32, %class.mrSurfaceList, i32, i32, i32, i32, i32, i32 }
+%class.mrSurfaceList = type { %class.ggSolidTexture, %class.ggTrain }
+%class.ggSolidTexture = type { ptr }
+%class.ggTrain = type { ptr, i32, i32 }
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_Znaj() local_unnamed_addr
+
+declare dso_local fastcc ptr @_ZN12ggDictionaryI14mrObjectRecordE6lookUpERK8ggString() unnamed_addr align 2
+
+define dso_local fastcc void @_ZN7mrScene9AddObjectEP9mrSurfaceRK8ggStringS4_i() unnamed_addr align 2 personality ptr @__gxx_personality_v0 {
+entry:
+ br i1 undef, label %_ZN12ggDictionaryI10ggMaterialE6lookUpERK8ggString.exit, label %while.body.i.i.lr.ph
+
+while.body.i.i.lr.ph: ; preds = %entry
+ unreachable
+
+_ZN12ggDictionaryI10ggMaterialE6lookUpERK8ggString.exit: ; preds = %entry
+ %call5 = tail call fastcc ptr @_ZN12ggDictionaryI14mrObjectRecordE6lookUpERK8ggString()
+ br i1 undef, label %if.then7, label %if.end11
+
+if.then7: ; preds = %_ZN12ggDictionaryI10ggMaterialE6lookUpERK8ggString.exit
+ invoke void @_Znaj()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont: ; preds = %if.then7
+ br label %if.end11
+
+lpad: ; preds = %if.then7
+ %0 = landingpad { ptr, i32 }
+ cleanup
+ resume { ptr, i32 } %0
+
+if.end11: ; preds = %invoke.cont, %_ZN12ggDictionaryI10ggMaterialE6lookUpERK8ggString.exit
+ %surfaces.i.i7 = getelementptr inbounds %class.mrObjectRecord, ptr %call5, i32 0, i32 2, i32 1
+ br label %for.body.i.i.i
+
+for.cond.cleanup.i.i.i: ; preds = %for.body.i.i.i
+ ret void
+
+for.body.i.i.i: ; preds = %for.body.i.i.i, %if.end11
+ %i.0.i.i.i52 = phi i32 [ %inc.i.i.i, %for.body.i.i.i ], [ 0, %if.end11 ]
+ %1 = load i32, ptr undef, align 4
+ %2 = load ptr, ptr %surfaces.i.i7, align 4
+ %arrayidx9.i.i.i = getelementptr inbounds ptr, ptr %2, i32 %i.0.i.i.i52
+ store i32 %1, ptr %arrayidx9.i.i.i, align 4
+ %inc.i.i.i = add nuw nsw i32 %i.0.i.i.i52, 1
+ br i1 false, label %for.body.i.i.i, label %for.cond.cleanup.i.i.i
+}
diff --git a/llvm/test/CodeGen/Hexagon/miunroll-valign.ll b/llvm/test/CodeGen/Hexagon/miunroll-valign.ll
new file mode 100644
index 0000000000000..1a69b6fce204c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/miunroll-valign.ll
@@ -0,0 +1,82 @@
+; RUN: llc -O3 -march=hexagon -enable-machine-unroller=true < %s | FileCheck %s
+; REQUIRES: asserts
+
+; This test used to assert because the machine unroller copied the valign
+; instruction with the third parameter as IntRegs instead of IntRegsLow8. The
+; valign instruction requires its third parameter to have IntRegsLow8 type.
+
+; CHECK: loop0(
+; CHECK: valign
+; CHECK: valign
+; CHECK: endloop0
+
+@a = common dso_local local_unnamed_addr global i32 0, align 4
+@b = common dso_local local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: nofree nosync nounwind
+define dso_local i32 @c() #0 { ; outer loop %for.cond1.preheader.us around the single-block inner loop %for.body2.us
+entry:
+ %0 = load i32, ptr @b, align 4 ; %0 is the inner loop's exit bound (see %exitcond.not)
+ %1 = load i32, ptr @a, align 4 ; %1 seeds the counter phi %4
+ %2 = icmp slt i32 %1, %0
+ %cmp = icmp ult ptr inttoptr (i32 2 to ptr), @c ; compares the constant address 2 with the address of @c
+ %or.cond = select i1 %cmp , i1 %2, i1 false
+ br i1 %or.cond, label %for.cond1.preheader.us.preheader, label %for.end3
+
+for.cond1.preheader.us.preheader: ; preds = %entry
+ %3 = shl i32 %0, 2 ; %3 = 4 * %0, used for the closed-form %11 below
+ br label %for.cond1.preheader.us
+
+for.cond1.preheader.us: ; preds = %for.cond.loopexit.us, %for.cond1.preheader.us.preheader
+ %4 = phi i32 [ %9, %for.cond.loopexit.us ], [ %1, %for.cond1.preheader.us.preheader ]
+ %e.019.us = phi i32 [ %e.1.lcssa.us, %for.cond.loopexit.us ], [ undef, %for.cond1.preheader.us.preheader ]
+ %k.018.us = phi <32 x i32> [ %k.1.lcssa.us, %for.cond.loopexit.us ], [ undef, %for.cond1.preheader.us.preheader ]
+ %cmp14.us = icmp slt i32 %4, %0
+ br i1 %cmp14.us, label %for.body2.us.preheader, label %for.cond.loopexit.us ; skip the inner loop when %4 >= %0
+
+for.body2.us.preheader: ; preds = %for.cond1.preheader.us
+ %.neg = mul i32 %4, -4
+ br label %for.body2.us
+
+for.body2.us: ; preds = %for.body2.us, %for.body2.us.preheader
+ %5 = phi i32 [ %inc.us, %for.body2.us ], [ %4, %for.body2.us.preheader ]
+ %e.116.us = phi i32 [ %add.us, %for.body2.us ], [ %e.019.us, %for.body2.us.preheader ]
+ %k.115.us = phi <32 x i32> [ %8, %for.body2.us ], [ %k.018.us, %for.body2.us.preheader ]
+ %6 = tail call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> undef, <32 x i32> undef, i32 %e.116.us) ; third argument is the loop-carried %e.116.us
+ %add.us = add nsw i32 %e.116.us, 4 ; %e.116.us advances by 4 per iteration
+ %7 = tail call <32 x i32> @llvm.hexagon.V6.vmpy.qf32.sf.128B(<32 x i32> %6, <32 x i32> undef)
+ %8 = tail call <32 x i32> @llvm.hexagon.V6.vadd.qf32.128B(<32 x i32> %k.115.us, <32 x i32> %7) ; accumulates into the loop-carried vector %k.115.us
+ %inc.us = add nsw i32 %5, 1
+ %exitcond.not = icmp eq i32 %inc.us, %0
+ br i1 %exitcond.not, label %for.cond1.for.cond.loopexit_crit_edge.us, label %for.body2.us ; leave the inner loop once %inc.us equals %0
+
+for.cond.loopexit.us: ; preds = %for.cond1.for.cond.loopexit_crit_edge.us, %for.cond1.preheader.us
+ %9 = phi i32 [ %0, %for.cond1.for.cond.loopexit_crit_edge.us ], [ %4, %for.cond1.preheader.us ]
+ %k.1.lcssa.us = phi <32 x i32> [ %8, %for.cond1.for.cond.loopexit_crit_edge.us ], [ %k.018.us, %for.cond1.preheader.us ]
+ %e.1.lcssa.us = phi i32 [ %11, %for.cond1.for.cond.loopexit_crit_edge.us ], [ %e.019.us, %for.cond1.preheader.us ]
+ %cmp2 = icmp ult ptr inttoptr (i32 2 to ptr), @c ; same address comparison as %cmp in %entry
+ br i1 %cmp2, label %for.cond1.preheader.us, label %for.end3
+
+for.cond1.for.cond.loopexit_crit_edge.us: ; preds = %for.body2.us
+ %10 = add i32 %3, %e.019.us
+ %11 = add i32 %.neg, %10 ; %11 = %e.019.us + 4 * %0 - 4 * %4
+ store i32 %0, ptr @a, align 4
+ br label %for.cond.loopexit.us
+
+for.end3: ; preds = %for.cond.loopexit.us, %entry
+ %k.0.lcssa = phi <32 x i32> [ undef, %entry ], [ %k.1.lcssa.us, %for.cond.loopexit.us ]
+ store <32 x i32> %k.0.lcssa, ptr inttoptr (i32 2 to ptr), align 128 ; result vector is stored to the constant address 2
+ ret i32 undef
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32>, <32 x i32>, i32) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vmpy.qf32.sf.128B(<32 x i32>, <32 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vadd.qf32.128B(<32 x i32>, <32 x i32>) #1
+
+attributes #0 = { nofree nosync nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+hvx-length128b,+hvxv68,+v68,-long-calls" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/Hexagon/miunroll.ll b/llvm/test/CodeGen/Hexagon/miunroll.ll
new file mode 100644
index 0000000000000..eda099ee45d94
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/miunroll.ll
@@ -0,0 +1,52 @@
+; RUN: llc -O3 -march=hexagon -enable-machine-unroller=false < %s | FileCheck --check-prefix=CHECK-NO-UNROLL %s
+; RUN: llc -O3 -march=hexagon -enable-machine-unroller=true < %s | FileCheck --check-prefix=CHECK-UNROLL %s
+
+; Make sure that there's only one hardware loop, with a single sfmpy in it, when the machine unroller is disabled.
+; CHECK-NO-UNROLL: loop0(.LBB0_[[LOOP:[0-9]+]]
+; CHECK-NO-UNROLL: .LBB0_[[LOOP]]:
+; CHECK-NO-UNROLL: sfmpy
+; CHECK-NO-UNROLL-NOT: sfmpy
+; CHECK-NO-UNROLL: endloop0
+; CHECK-NO-UNROLL-NOT: loop0
+
+; When the machine unroller is enabled, expect an unrolled hardware loop: two sfmpy must appear after the loop label and before an endloop0. A separate remainder loop is not checked here.
+; CHECK-UNROLL: loop0(.LBB0_[[LOOP:[0-9]+]]
+; CHECK-UNROLL: .LBB0_[[LOOP]]:
+; CHECK-UNROLL: sfmpy
+; CHECK-UNROLL: sfmpy
+; CHECK-UNROLL: endloop0
+
+define float @test(i32 %n, float %da, ptr noalias nocapture readonly %dx, i32 %incx, ptr noalias nocapture %dy, i32 %incy) local_unnamed_addr { ; guarded single-block loop; pointers use the opaque 'ptr' type like the other tests in this patch
+entry:
+ %cmp = icmp slt i32 %n, 1
+ %cmp1 = fcmp oeq float %da, 0.000000e+00
+ %or.cond45 = or i1 %cmp, %cmp1
+ br i1 %or.cond45, label %if.then6, label %if.end3 ; return early when %n < 1 or %da == 0.0
+
+if.end3:
+ %cmp4 = icmp ne i32 %incx, 1
+ %cmp5 = icmp ne i32 %incy, 1
+ %or.cond = or i1 %cmp4, %cmp5
+ br i1 %or.cond, label %if.then6, label %for.body.lr.ph ; the loop is entered only when %incx and %incy are both 1
+
+if.then6: ; reached from both early exits and from the loop exit
+ ret float 0.000000e+00
+
+for.body.lr.ph:
+ %0 = load float, ptr %dy, align 4 ; loaded once, before the loop
+ br label %for.body
+
+for.body:
+ %arrayidx18.phi = phi ptr [ %dx, %for.body.lr.ph ], [ %arrayidx18.inc, %for.body ]
+ %arrayidx21.phi = phi ptr [ %dy, %for.body.lr.ph ], [ %arrayidx21.inc, %for.body ]
+ %i.047 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %1 = load float, ptr %arrayidx18.phi, align 4
+ %mul19 = fmul float %1, %da
+ %add20 = fadd float %0, %mul19 ; stored value is %0 + (*dx cursor) * %da
+ store float %add20, ptr %arrayidx21.phi, align 4
+ %inc = add nuw nsw i32 %i.047, 1
+ %exitcond = icmp eq i32 %inc, %n
+ %arrayidx18.inc = getelementptr float, ptr %arrayidx18.phi, i32 32 ; both cursors advance by 32 floats per iteration
+ %arrayidx21.inc = getelementptr float, ptr %arrayidx21.phi, i32 32
+ br i1 %exitcond, label %if.then6, label %for.body ; exit once %inc equals %n
+}
More information about the llvm-commits
mailing list