[llvm] [Hexagon] Add Loop Alignment pass. (PR #83328)
Sumanth Gundapaneni via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 28 12:27:31 PST 2024
https://github.com/sgundapa updated https://github.com/llvm/llvm-project/pull/83328
>From 5aaa5049b837b68b4d8528184e9d2a65e0e63074 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sgundapa at quicinc.com>
Date: Wed, 28 Feb 2024 12:23:35 -0800
Subject: [PATCH] [Hexagon] Add Loop Alignment pass.
Inspect a basic block and, if it is a single-basic-block loop with a small
number of instructions, set the loop alignment to 32 bytes.
This avoids a cache-line break in the first packet of the loop,
which would otherwise cause a stall on every execution of the loop.
---
llvm/lib/Target/Hexagon/CMakeLists.txt | 1 +
llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp | 205 ++++++++++++++++++
.../Target/Hexagon/HexagonTargetMachine.cpp | 17 +-
.../lib/Target/Hexagon/HexagonTargetMachine.h | 4 +
llvm/test/CodeGen/Hexagon/loop-balign.ll | 91 ++++++++
llvm/test/CodeGen/Hexagon/loop_align_count.ll | 115 ++++++++++
.../test/CodeGen/Hexagon/loop_align_count.mir | 130 +++++++++++
llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll | 117 ++++++++++
8 files changed, 679 insertions(+), 1 deletion(-)
create mode 100644 llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp
create mode 100644 llvm/test/CodeGen/Hexagon/loop-balign.ll
create mode 100644 llvm/test/CodeGen/Hexagon/loop_align_count.ll
create mode 100644 llvm/test/CodeGen/Hexagon/loop_align_count.mir
create mode 100644 llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll
diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt
index a22a5c11e6ab3a..cdc062eee72b1e 100644
--- a/llvm/lib/Target/Hexagon/CMakeLists.txt
+++ b/llvm/lib/Target/Hexagon/CMakeLists.txt
@@ -43,6 +43,7 @@ add_llvm_target(HexagonCodeGen
HexagonISelDAGToDAGHVX.cpp
HexagonISelLowering.cpp
HexagonISelLoweringHVX.cpp
+ HexagonLoopAlign.cpp
HexagonLoopIdiomRecognition.cpp
HexagonMachineFunctionInfo.cpp
HexagonMachineScheduler.cpp
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp b/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp
new file mode 100644
index 00000000000000..40848f7fa069ed
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp
@@ -0,0 +1,205 @@
+//===----- HexagonLoopAlign.cpp - Generate loop alignment directives -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Inspect a basic block and, if it is a single-basic-block loop with a small
+// number of instructions, set the prefLoopAlignment to 32 bytes (log2 = 5).
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-loop-align"
+
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ DisableLoopAlign("disable-hexagon-loop-align", cl::Hidden,
+ cl::desc("Disable Hexagon loop alignment pass"));
+
+static cl::opt<uint32_t> HVXLoopAlignLimitUB(
+ "hexagon-hvx-loop-align-limit-ub", cl::Hidden, cl::init(16),
+ cl::desc("Set hexagon hvx loop upper bound align limit"));
+
+static cl::opt<uint32_t> TinyLoopAlignLimitUB(
+ "hexagon-tiny-loop-align-limit-ub", cl::Hidden, cl::init(16),
+ cl::desc("Set hexagon tiny-core loop upper bound align limit"));
+
+static cl::opt<uint32_t>
+ LoopAlignLimitUB("hexagon-loop-align-limit-ub", cl::Hidden, cl::init(8),
+ cl::desc("Set hexagon loop upper bound align limit"));
+
+static cl::opt<uint32_t>
+ LoopAlignLimitLB("hexagon-loop-align-limit-lb", cl::Hidden, cl::init(4),
+ cl::desc("Set hexagon loop lower bound align limit"));
+
+static cl::opt<uint32_t>
+ LoopBndlAlignLimit("hexagon-loop-bundle-align-limit", cl::Hidden,
+ cl::init(4),
+ cl::desc("Set hexagon loop align bundle limit"));
+
+static cl::opt<uint32_t> TinyLoopBndlAlignLimit(
+ "hexagon-tiny-loop-bundle-align-limit", cl::Hidden, cl::init(8),
+ cl::desc("Set hexagon tiny-core loop align bundle limit"));
+
+static cl::opt<uint32_t>
+ LoopEdgeThreshold("hexagon-loop-edge-threshold", cl::Hidden, cl::init(7500),
+ cl::desc("Set hexagon loop align edge threshold"));
+
+namespace llvm {
+FunctionPass *createHexagonLoopAlign();
+void initializeHexagonLoopAlignPass(PassRegistry &);
+} // namespace llvm
+
+namespace {
+
+class HexagonLoopAlign : public MachineFunctionPass {
+public:
+ static char ID;
+ HexagonLoopAlign() : MachineFunctionPass(ID) {
+ initializeHexagonLoopAlignPass(*PassRegistry::getPassRegistry());
+ }
+ bool shouldBalignLoop(MachineBasicBlock &BB, const HexagonSubtarget *HST,
+ const HexagonInstrInfo *HII, bool AboveThres);
+ bool isSingleLoop(MachineBasicBlock &MBB);
+ bool attemptToBalignSmallLoop(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return "Hexagon LoopAlign pass"; }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+char HexagonLoopAlign::ID = 0;
+
+bool HexagonLoopAlign::shouldBalignLoop(MachineBasicBlock &BB,
+ const HexagonSubtarget *HST,
+ const HexagonInstrInfo *HII,
+ bool AboveThres) {
+ bool isVec = false;
+ unsigned InstCnt = 0;
+ unsigned BndlCnt = 0;
+
+ for (MachineBasicBlock::instr_iterator II = BB.instr_begin(),
+ IE = BB.instr_end();
+ II != IE; ++II) {
+
+ // End if the instruction is endloop.
+ if (HII->isEndLoopN(II->getOpcode()))
+ break;
+ // Count the number of bundles.
+ if (II->isBundle()) {
+ BndlCnt++;
+ continue;
+ }
+ // Skip over debug instructions.
+ if (II->isDebugInstr())
+ continue;
+ // Check if there are any HVX instructions in loop.
+ isVec |= HII->isHVXVec(*II);
+ // Count the number of instructions.
+ InstCnt++;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Bundle Count : " << BndlCnt << "\n";
+ dbgs() << "Instruction Count : " << InstCnt << "\n";
+ });
+
+ unsigned LimitUB = 0;
+ unsigned LimitBndl = LoopBndlAlignLimit;
+ // The conditions in the order of priority.
+ if (HST->isTinyCore()) {
+ LimitUB = TinyLoopAlignLimitUB;
+ LimitBndl = TinyLoopBndlAlignLimit;
+ } else if (isVec)
+ LimitUB = HVXLoopAlignLimitUB;
+ else if (AboveThres)
+ LimitUB = LoopAlignLimitUB;
+
+ // if the upper bound is not set to a value, implies we didn't meet
+ // the criteria.
+ if (LimitUB == 0)
+ return false;
+
+ return InstCnt >= LoopAlignLimitLB && InstCnt <= LimitUB &&
+ BndlCnt <= LimitBndl;
+}
+
+bool HexagonLoopAlign::isSingleLoop(MachineBasicBlock &MBB) {
+ int Succs = MBB.succ_size();
+ return (MBB.isSuccessor(&MBB) && (Succs == 2));
+}
+
+bool HexagonLoopAlign::attemptToBalignSmallLoop(MachineFunction &MF,
+ MachineBasicBlock &MBB) {
+ if (!isSingleLoop(MBB))
+ return false;
+
+ const auto HST = &MF.getSubtarget<HexagonSubtarget>();
+ const auto HII = HST->getInstrInfo();
+
+ const MachineBranchProbabilityInfo *MBPI =
+ &getAnalysis<MachineBranchProbabilityInfo>();
+ const MachineBlockFrequencyInfo *MBFI =
+ &getAnalysis<MachineBlockFrequencyInfo>();
+
+ // Compute frequency of back edge,
+ BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
+ BranchProbability BrProb = MBPI->getEdgeProbability(&MBB, &MBB);
+ BlockFrequency EdgeFreq = BlockFreq * BrProb;
+ LLVM_DEBUG({
+ dbgs() << "Loop Align Pass:\n";
+ dbgs() << "\tedge with freq(" << EdgeFreq.getFrequency() << ")\n";
+ });
+
+ bool AboveThres = EdgeFreq.getFrequency() > LoopEdgeThreshold;
+ if (shouldBalignLoop(MBB, HST, HII, AboveThres)) {
+ // We found a loop, change its alignment to be 32 (5).
+ MBB.setAlignment(llvm::Align(1 << 5));
+ return true;
+ }
+ return false;
+}
+
+// Inspect each basic block and, if it is a single-BB loop, check whether it
+// meets the criteria for increasing its alignment to 32 bytes.
+// This optimization is performed at
+// i) -O2 and above, when the loop has an HVX instruction.
+// ii) -O3
+
+bool HexagonLoopAlign::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+ if (DisableLoopAlign)
+ return false;
+
+ bool Changed = false;
+ for (MachineFunction::iterator MBBi = MF.begin(), MBBe = MF.end();
+ MBBi != MBBe; ++MBBi) {
+ MachineBasicBlock &MBB = *MBBi;
+ Changed |= attemptToBalignSmallLoop(MF, MBB);
+ }
+ return Changed;
+}
+
+} // namespace
+
+INITIALIZE_PASS(HexagonLoopAlign, "hexagon-loop-align",
+ "Hexagon LoopAlign pass", false, false)
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonLoopAlign() { return new HexagonLoopAlign(); }
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 7d77286339399d..61188d11d936aa 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -164,6 +164,7 @@ namespace llvm {
void initializeHexagonGenMuxPass(PassRegistry&);
void initializeHexagonHardwareLoopsPass(PassRegistry&);
void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
+ void initializeHexagonLoopAlignPass(PassRegistry &);
void initializeHexagonNewValueJumpPass(PassRegistry&);
void initializeHexagonOptAddrModePass(PassRegistry&);
void initializeHexagonPacketizerPass(PassRegistry&);
@@ -194,6 +195,7 @@ namespace llvm {
FunctionPass *createHexagonHardwareLoops();
FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
CodeGenOptLevel OptLevel);
+ FunctionPass *createHexagonLoopAlign();
FunctionPass *createHexagonLoopRescheduling();
FunctionPass *createHexagonNewValueJump();
FunctionPass *createHexagonOptAddrMode();
@@ -256,8 +258,10 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
TT, CPU, FS, Options, getEffectiveRelocModel(RM),
getEffectiveCodeModel(CM, CodeModel::Small),
(HexagonNoOpt ? CodeGenOptLevel::None : OL)),
- TLOF(std::make_unique<HexagonTargetObjectFile>()) {
+ TLOF(std::make_unique<HexagonTargetObjectFile>()),
+ Subtarget(Triple(TT), CPU, FS, *this) {
initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
+ initializeHexagonLoopAlignPass(*PassRegistry::getPassRegistry());
initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry());
initAsmInfo();
}
@@ -459,6 +463,8 @@ void HexagonPassConfig::addPreSched2() {
void HexagonPassConfig::addPreEmitPass() {
bool NoOpt = (getOptLevel() == CodeGenOptLevel::None);
+ const HexagonTargetMachine &HTM = getHexagonTargetMachine();
+ const HexagonSubtarget *HST = HTM.getSubtargetImpl();
if (!NoOpt)
addPass(createHexagonNewValueJump());
@@ -476,6 +482,15 @@ void HexagonPassConfig::addPreEmitPass() {
// Packetization is mandatory: it handles gather/scatter at all opt levels.
addPass(createHexagonPacketizer(NoOpt));
+ if (!NoOpt) {
+ // Loop Alignment to 32 for smaller loops. Performed at
+ // i) -O2 and above, and when the loop has HVX instruction.
+ // ii) -O3
+ if ((HTM.getOptLevel() >= CodeGenOptLevel::Default && HST->useHVXOps()) ||
+ HTM.getOptLevel() == CodeGenOptLevel::Aggressive)
+ addPass(createHexagonLoopAlign());
+ }
+
if (EnableVectorPrint)
addPass(createHexagonVectorPrint());
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
index c5fed0cd65a814..c7cb1ddcba203a 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -23,6 +23,7 @@ namespace llvm {
class HexagonTargetMachine : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ HexagonSubtarget Subtarget;
mutable StringMap<std::unique_ptr<HexagonSubtarget>> SubtargetMap;
public:
@@ -32,6 +33,9 @@ class HexagonTargetMachine : public LLVMTargetMachine {
std::optional<CodeModel::Model> CM, CodeGenOptLevel OL,
bool JIT);
~HexagonTargetMachine() override;
+ const HexagonSubtarget *getSubtargetImpl() const {
+ return &Subtarget;
+ }
const HexagonSubtarget *getSubtargetImpl(const Function &F) const override;
void registerPassBuilderCallbacks(PassBuilder &PB,
diff --git a/llvm/test/CodeGen/Hexagon/loop-balign.ll b/llvm/test/CodeGen/Hexagon/loop-balign.ll
new file mode 100644
index 00000000000000..9d1f42a4b14b18
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/loop-balign.ll
@@ -0,0 +1,91 @@
+; RUN: llc -march=hexagon -O3 < %s | FileCheck %s -check-prefix=BALIGN
+; BALIGN: .p2align{{.*}}5
+
+; The test for checking the alignment of 'for.body4.for.body4_crit_edge' basic block
+
+define dso_local void @foo(i32 %nCol, i32 %nRow, ptr nocapture %resMat) local_unnamed_addr {
+entry:
+ %shl = shl i32 %nRow, 2
+ %cmp36 = icmp sgt i32 %nRow, 0
+ %0 = add i32 %nCol, -1
+ %.inv = icmp slt i32 %0, 1
+ %1 = select i1 %.inv, i32 1, i32 %nCol
+ br label %Outerloop
+
+Outerloop: ; preds = %for.end7, %entry
+ %r12.0 = phi i32 [ 0, %entry ], [ %inc8, %for.end7 ]
+ %r7_6.0 = phi i64 [ undef, %entry ], [ %r7_6.1.lcssa, %for.end7 ]
+ %r0i.0 = phi i32 [ undef, %entry ], [ %r0i.1.lcssa, %for.end7 ]
+ %r5.0 = phi ptr [ %resMat, %entry ], [ %r5.1.lcssa, %for.end7 ]
+ %r8.0 = phi i32 [ %shl, %entry ], [ %r8.1.lcssa, %for.end7 ]
+ br i1 %cmp36, label %for.body.lr.ph, label %for.end7
+
+for.body.lr.ph: ; preds = %Outerloop
+ %cmp332 = icmp eq i32 %r12.0, 0
+ %exitcond.peel = icmp eq i32 %r12.0, 1
+ br label %for.body
+
+for.body: ; preds = %for.end, %for.body.lr.ph
+ %r8.141 = phi i32 [ %r8.0, %for.body.lr.ph ], [ %add, %for.end ]
+ %r5.140 = phi ptr [ %r5.0, %for.body.lr.ph ], [ %add.ptr, %for.end ]
+ %i.039 = phi i32 [ 0, %for.body.lr.ph ], [ %inc6, %for.end ]
+ %r0i.138 = phi i32 [ %r0i.0, %for.body.lr.ph ], [ %4, %for.end ]
+ %r7_6.137 = phi i64 [ %r7_6.0, %for.body.lr.ph ], [ %r7_6.2.lcssa, %for.end ]
+ %add = add nsw i32 %r8.141, %shl
+ br i1 %cmp332, label %for.end, label %for.body4.peel
+
+for.body4.peel: ; preds = %for.body
+ %r1i.0.in.peel = inttoptr i32 %r8.141 to ptr
+ %r1i.0.peel = load i32, ptr %r1i.0.in.peel, align 4
+ %2 = tail call i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64 %r7_6.137, i32 %r1i.0.peel, i32 %r0i.138)
+ br i1 %exitcond.peel, label %for.end, label %for.body4.preheader.peel.newph
+
+for.body4.preheader.peel.newph: ; preds = %for.body4.peel
+ %r1i.0.in = inttoptr i32 %add to ptr
+ %r1i.0 = load i32, ptr %r1i.0.in, align 4
+ br label %for.body4
+
+for.body4: ; preds = %for.body4.for.body4_crit_edge, %for.body4.preheader.peel.newph
+ %inc.phi = phi i32 [ %inc.0, %for.body4.for.body4_crit_edge ], [ 2, %for.body4.preheader.peel.newph ]
+ %r7_6.233 = phi i64 [ %3, %for.body4.for.body4_crit_edge ], [ %2, %for.body4.preheader.peel.newph ]
+ %3 = tail call i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64 %r7_6.233, i32 %r1i.0, i32 %r0i.138)
+ %exitcond = icmp eq i32 %inc.phi, %r12.0
+ br i1 %exitcond, label %for.end.loopexit, label %for.body4.for.body4_crit_edge
+
+for.body4.for.body4_crit_edge: ; preds = %for.body4
+ %inc.0 = add nuw nsw i32 %inc.phi, 1
+ br label %for.body4
+
+for.end.loopexit: ; preds = %for.body4
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %for.body4.peel, %for.body
+ %r7_6.2.lcssa = phi i64 [ %r7_6.137, %for.body ], [ %2, %for.body4.peel ], [ %3, %for.end.loopexit ]
+ %4 = tail call i32 @llvm.hexagon.S2.clbp(i64 %r7_6.2.lcssa)
+ store i32 %4, ptr %r5.140, align 4
+ %add.ptr = getelementptr inbounds i8, ptr %r5.140, i32 undef
+ %inc6 = add nuw nsw i32 %i.039, 1
+ %exitcond47 = icmp eq i32 %inc6, %nRow
+ br i1 %exitcond47, label %for.end7.loopexit, label %for.body
+
+for.end7.loopexit: ; preds = %for.end
+ br label %for.end7
+
+for.end7: ; preds = %for.end7.loopexit, %Outerloop
+ %r7_6.1.lcssa = phi i64 [ %r7_6.0, %Outerloop ], [ %r7_6.2.lcssa, %for.end7.loopexit ]
+ %r0i.1.lcssa = phi i32 [ %r0i.0, %Outerloop ], [ %4, %for.end7.loopexit ]
+ %r5.1.lcssa = phi ptr [ %r5.0, %Outerloop ], [ %add.ptr, %for.end7.loopexit ]
+ %r8.1.lcssa = phi i32 [ %r8.0, %Outerloop ], [ %add, %for.end7.loopexit ]
+ %inc8 = add nuw i32 %r12.0, 1
+ %exitcond48 = icmp eq i32 %inc8, %1
+ br i1 %exitcond48, label %if.end, label %Outerloop
+
+if.end: ; preds = %for.end7
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64, i32, i32)
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.hexagon.S2.clbp(i64)
diff --git a/llvm/test/CodeGen/Hexagon/loop_align_count.ll b/llvm/test/CodeGen/Hexagon/loop_align_count.ll
new file mode 100644
index 00000000000000..07d7e4a8d61176
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/loop_align_count.ll
@@ -0,0 +1,115 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv73 -O2 -mattr=+hvxv73,hvx-length64b \
+; RUN: -debug-only=hexagon-loop-align 2>&1 < %s | FileCheck %s
+; Validate that there are 4 bundles in the loop.
+
+; CHECK: Loop Align Pass:
+; CHECK: Bundle Count : 4
+; CHECK: .p2align{{.*}}5
+
+; Function Attrs: nounwind
+define void @ham(ptr noalias nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3, ptr noalias nocapture %arg4, i32 %arg5) #0 {
+bb:
+ %ashr = ashr i32 %arg3, 2
+ %ashr6 = ashr i32 %arg3, 1
+ %add = add nsw i32 %ashr6, %ashr
+ %icmp = icmp sgt i32 %arg2, 0
+ br i1 %icmp, label %bb7, label %bb61
+
+bb7: ; preds = %bb
+ %sdiv = sdiv i32 %arg1, 64
+ %icmp8 = icmp sgt i32 %arg1, 63
+ br label %bb9
+
+bb9: ; preds = %bb57, %bb7
+ %phi = phi i32 [ 0, %bb7 ], [ %add58, %bb57 ]
+ %ashr10 = ashr exact i32 %phi, 1
+ %mul = mul nsw i32 %ashr10, %arg3
+ br i1 %icmp8, label %bb11, label %bb57
+
+bb11: ; preds = %bb9
+ %add12 = add nsw i32 %phi, 1
+ %mul13 = mul nsw i32 %add12, %arg5
+ %mul14 = mul nsw i32 %phi, %arg5
+ %add15 = add i32 %add, %mul
+ %add16 = add i32 %mul, %ashr
+ %add17 = add i32 %mul, %ashr6
+ %getelementptr = getelementptr inbounds i8, ptr %arg4, i32 %mul13
+ %getelementptr18 = getelementptr inbounds i8, ptr %arg4, i32 %mul14
+ %getelementptr19 = getelementptr inbounds i16, ptr %arg, i32 %add15
+ %getelementptr20 = getelementptr inbounds i16, ptr %arg, i32 %add16
+ %getelementptr21 = getelementptr inbounds i16, ptr %arg, i32 %add17
+ %getelementptr22 = getelementptr inbounds i16, ptr %arg, i32 %mul
+ %bitcast = bitcast ptr %getelementptr to ptr
+ %bitcast23 = bitcast ptr %getelementptr18 to ptr
+ %bitcast24 = bitcast ptr %getelementptr19 to ptr
+ %bitcast25 = bitcast ptr %getelementptr20 to ptr
+ %bitcast26 = bitcast ptr %getelementptr21 to ptr
+ %bitcast27 = bitcast ptr %getelementptr22 to ptr
+ br label %bb28
+
+bb28: ; preds = %bb28, %bb11
+ %phi29 = phi i32 [ 0, %bb11 ], [ %add54, %bb28 ]
+ %phi30 = phi ptr [ %bitcast27, %bb11 ], [ %getelementptr36, %bb28 ]
+ %phi31 = phi ptr [ %bitcast26, %bb11 ], [ %getelementptr37, %bb28 ]
+ %phi32 = phi ptr [ %bitcast25, %bb11 ], [ %getelementptr39, %bb28 ]
+ %phi33 = phi ptr [ %bitcast24, %bb11 ], [ %getelementptr41, %bb28 ]
+ %phi34 = phi ptr [ %bitcast, %bb11 ], [ %getelementptr53, %bb28 ]
+ %phi35 = phi ptr [ %bitcast23, %bb11 ], [ %getelementptr52, %bb28 ]
+ %getelementptr36 = getelementptr inbounds <16 x i32>, ptr %phi30, i32 1
+ %load = load <16 x i32>, ptr %phi30, align 64
+ %getelementptr37 = getelementptr inbounds <16 x i32>, ptr %phi31, i32 1
+ %load38 = load <16 x i32>, ptr %phi31, align 64
+ %getelementptr39 = getelementptr inbounds <16 x i32>, ptr %phi32, i32 1
+ %load40 = load <16 x i32>, ptr %phi32, align 64
+ %getelementptr41 = getelementptr inbounds <16 x i32>, ptr %phi33, i32 1
+ %load42 = load <16 x i32>, ptr %phi33, align 64
+ %call = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load, <16 x i32> %load38)
+ %call43 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load, <16 x i32> %load38)
+ %call44 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load40, <16 x i32> %load42)
+ %call45 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load40, <16 x i32> %load42)
+ %call46 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call, <16 x i32> %call44)
+ %call47 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call, <16 x i32> %call44)
+ %call48 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call43, <16 x i32> %call45)
+ %call49 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call43, <16 x i32> %call45)
+ %call50 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call47, <16 x i32> %call46)
+ %call51 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call49, <16 x i32> %call48)
+ %getelementptr52 = getelementptr inbounds <16 x i32>, ptr %phi35, i32 1
+ store <16 x i32> %call50, ptr %phi35, align 64
+ %getelementptr53 = getelementptr inbounds <16 x i32>, ptr %phi34, i32 1
+ store <16 x i32> %call51, ptr %phi34, align 64
+ %add54 = add nsw i32 %phi29, 1
+ %icmp55 = icmp slt i32 %add54, %sdiv
+ br i1 %icmp55, label %bb28, label %bb56
+
+bb56: ; preds = %bb28
+ br label %bb57
+
+bb57: ; preds = %bb56, %bb9
+ %add58 = add nsw i32 %phi, 2
+ %icmp59 = icmp slt i32 %add58, %arg2
+ br i1 %icmp59, label %bb9, label %bb60
+
+bb60: ; preds = %bb57
+ br label %bb61
+
+bb61: ; preds = %bb60, %bb
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32>, <16 x i32>) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/Hexagon/loop_align_count.mir b/llvm/test/CodeGen/Hexagon/loop_align_count.mir
new file mode 100644
index 00000000000000..b08dcbaeb56609
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/loop_align_count.mir
@@ -0,0 +1,130 @@
+# RUN: llc -march=hexagon -run-pass hexagon-loop-align -o - %s\
+# RUN: -debug-only=hexagon-loop-align -verify-machineinstrs 2>&1 | FileCheck %s
+
+# Test that we only count up to the endloop instruction and that we align
+# this loop to 32.
+# CHECK: Loop Align Pass:
+# CHECK: Instruction Count : 16
+# CHECK: bb.5 (align 32)
+---
+name: fred
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ successors: %bb.1(0x50000000), %bb.8(0x30000000)
+ liveins: $r0, $r1, $r2, $r3, $r4, $r5
+
+ renamable $p0 = C2_cmpgti renamable $r2, 0
+ J2_jumpf killed renamable $p0, %bb.8, implicit-def dead $pc
+ J2_jump %bb.1, implicit-def dead $pc
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+ liveins: $r0, $r1, $r2, $r3, $r4, $r5
+
+ renamable $r7 = A2_addi killed renamable $r2, 1
+ renamable $r8 = S2_asr_i_r renamable $r1, 31
+ renamable $p0 = C2_cmpgti renamable $r1, 63
+ renamable $r2 = S2_asr_i_r renamable $r3, 2
+ renamable $r6 = S2_asr_i_r renamable $r3, 1
+ renamable $r9 = S2_lsr_i_r killed renamable $r7, 1
+ renamable $r1 = S2_lsr_i_r_acc killed renamable $r1, killed renamable $r8, 26
+ renamable $r7 = A2_tfrsi 0
+ renamable $r1 = S2_asr_i_r killed renamable $r1, 6
+ J2_loop1r %bb.2, killed renamable $r9, implicit-def $lc1, implicit-def $sa1
+ renamable $r8 = nsw A2_add renamable $r6, renamable $r2
+
+ bb.2:
+ successors: %bb.3(0x40000000), %bb.7(0x40000000)
+ liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8
+
+ J2_jumpf renamable $p0, %bb.7, implicit-def dead $pc
+ J2_jump %bb.3, implicit-def dead $pc
+
+ bb.3:
+ successors: %bb.4(0x80000000)
+ liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8
+
+ renamable $r13 = exact S2_asr_i_r renamable $r7, 1
+ renamable $r12 = COPY renamable $r4
+ renamable $r9 = COPY renamable $r4
+ renamable $r14 = nsw A2_addi renamable $r7, 1
+ renamable $r15 = nsw M2_mpyi killed renamable $r13, renamable $r3
+ renamable $r9 = M2_maci killed renamable $r9, killed renamable $r14, renamable $r5
+ renamable $r13 = A2_add renamable $r8, renamable $r15
+ renamable $r28 = A2_add renamable $r15, renamable $r2
+ renamable $r10 = A2_add renamable $r15, renamable $r6
+ renamable $r12 = M2_maci killed renamable $r12, renamable $r7, renamable $r5
+ renamable $r13 = S2_addasl_rrri renamable $r0, killed renamable $r13, 1
+ renamable $r14 = S2_addasl_rrri renamable $r0, killed renamable $r15, 1
+ renamable $r15 = S2_addasl_rrri renamable $r0, killed renamable $r28, 1
+ renamable $r28 = S2_addasl_rrri renamable $r0, killed renamable $r10, 1
+
+ bb.4:
+ successors: %bb.5(0x40000000), %bb.6(0x40000000)
+ liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $r13, $r14, $r15, $r28
+
+ renamable $v0, renamable $r14 = V6_vL32b_pi killed renamable $r14, 64
+ renamable $p1 = C2_cmpgtui renamable $r1, 1
+ renamable $r10 = A2_addi renamable $r1, -1
+ renamable $v2, renamable $r28 = V6_vL32b_pi killed renamable $r28, 64
+ renamable $v1 = V6_vaddh renamable $v0, renamable $v2
+ renamable $v3, renamable $r15 = V6_vL32b_pi killed renamable $r15, 64
+ renamable $v0 = V6_vsubh killed renamable $v0, killed renamable $v2
+ J2_loop0r %bb.5, killed renamable $r10, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+ renamable $v4, renamable $r13 = V6_vL32b_pi killed renamable $r13, 64
+ renamable $v2 = V6_vaddh renamable $v3, renamable $v4
+ J2_jumpf killed renamable $p1, %bb.6, implicit-def $pc
+ J2_jump %bb.5, implicit-def $pc
+
+ bb.5:
+ successors: %bb.5(0x7c000000), %bb.6(0x04000000)
+ liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $r13, $r14, $r15, $r28, $v0, $v1, $v2, $v3, $v4
+
+ renamable $v3 = V6_vsubh killed renamable $v3, killed renamable $v4
+ renamable $v4, renamable $r14 = V6_vL32b_pi killed renamable $r14, 64
+ renamable $v5 = V6_vnavgh renamable $v1, renamable $v2
+ renamable $v1 = V6_vavgh killed renamable $v1, killed renamable $v2
+ renamable $v2, renamable $r28 = V6_vL32b_pi killed renamable $r28, 64
+ renamable $v1 = V6_vsathub killed renamable $v5, killed renamable $v1
+ renamable $v5 = V6_vnavgh renamable $v0, renamable $v3
+ renamable $v6 = V6_vavgh killed renamable $v0, killed renamable $v3
+ renamable $r12 = V6_vS32b_pi killed renamable $r12, 64, killed renamable $v1
+ renamable $v1 = V6_vaddh renamable $v4, renamable $v2
+ renamable $v3, renamable $r15 = V6_vL32b_pi killed renamable $r15, 64
+ renamable $v0 = V6_vsubh killed renamable $v4, killed renamable $v2
+ renamable $v4, renamable $r13 = V6_vL32b_pi killed renamable $r13, 64
+ renamable $v2 = V6_vaddh renamable $v3, renamable $v4
+ renamable $v5 = V6_vsathub killed renamable $v5, killed renamable $v6
+ renamable $r9 = V6_vS32b_pi killed renamable $r9, 64, killed renamable $v5
+ ENDLOOP0 %bb.5, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+ J2_jump %bb.6, implicit-def $pc
+
+ bb.6:
+ successors: %bb.7(0x80000000)
+ liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $v0, $v1, $v2, $v3, $v4
+
+ renamable $v3 = V6_vsubh killed renamable $v3, killed renamable $v4
+ renamable $v4 = V6_vavgh renamable $v1, renamable $v2
+ renamable $v1 = V6_vnavgh killed renamable $v1, killed renamable $v2
+ renamable $v2 = V6_vavgh renamable $v0, renamable $v3
+ renamable $v0 = V6_vnavgh killed renamable $v0, killed renamable $v3
+ renamable $v1 = V6_vsathub killed renamable $v1, killed renamable $v4
+ dead renamable $r12 = V6_vS32b_pi killed renamable $r12, 64, killed renamable $v1
+ renamable $v0 = V6_vsathub killed renamable $v0, killed renamable $v2
+ dead renamable $r9 = V6_vS32b_pi killed renamable $r9, 64, killed renamable $v0
+ J2_jump %bb.7, implicit-def $pc
+
+ bb.7:
+ successors: %bb.2(0x7c000000), %bb.8(0x04000000)
+ liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8
+
+ renamable $r7 = nsw A2_addi killed renamable $r7, 2
+ ENDLOOP1 %bb.2, implicit-def $pc, implicit-def $lc1, implicit $sa1, implicit $lc1
+ J2_jump %bb.8, implicit-def dead $pc
+
+ bb.8:
+ PS_jmpret $r31, implicit-def dead $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll b/llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll
new file mode 100644
index 00000000000000..6b3c0a94a494dc
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll
@@ -0,0 +1,117 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv73 -O2 -mattr=+hvxv73,hvx-length64b < %s | FileCheck %s
+; CHECK: .p2align{{.*}}5
+
+; Function Attrs: nounwind
+define void @wobble(ptr noalias nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3, ptr noalias nocapture %arg4, i32 %arg5) #0 {
+bb:
+ %ashr = ashr i32 %arg3, 2
+ %ashr6 = ashr i32 %arg3, 1
+ %add = add nsw i32 %ashr6, %ashr
+ %icmp = icmp sgt i32 %arg2, 0
+ br i1 %icmp, label %bb7, label %bb61
+
+bb7: ; preds = %bb
+ %sdiv = sdiv i32 %arg1, 64
+ %icmp8 = icmp sgt i32 %arg1, 63
+ br label %bb9
+
+bb9: ; preds = %bb57, %bb7
+ %phi = phi i32 [ 0, %bb7 ], [ %add58, %bb57 ]
+ %ashr10 = ashr exact i32 %phi, 1
+ %mul = mul nsw i32 %ashr10, %arg3
+ br i1 %icmp8, label %bb11, label %bb57
+
+bb11: ; preds = %bb9
+ %add12 = add nsw i32 %phi, 1
+ %mul13 = mul nsw i32 %add12, %arg5
+ %mul14 = mul nsw i32 %phi, %arg5
+ %add15 = add i32 %add, %mul
+ %add16 = add i32 %mul, %ashr
+ %add17 = add i32 %mul, %ashr6
+ %getelementptr = getelementptr inbounds i8, ptr %arg4, i32 %mul13
+ %getelementptr18 = getelementptr inbounds i8, ptr %arg4, i32 %mul14
+ %getelementptr19 = getelementptr inbounds i16, ptr %arg, i32 %add15
+ %getelementptr20 = getelementptr inbounds i16, ptr %arg, i32 %add16
+ %getelementptr21 = getelementptr inbounds i16, ptr %arg, i32 %add17
+ %getelementptr22 = getelementptr inbounds i16, ptr %arg, i32 %mul
+ %bitcast = bitcast ptr %getelementptr to ptr
+ %bitcast23 = bitcast ptr %getelementptr18 to ptr
+ %bitcast24 = bitcast ptr %getelementptr19 to ptr
+ %bitcast25 = bitcast ptr %getelementptr20 to ptr
+ %bitcast26 = bitcast ptr %getelementptr21 to ptr
+ %bitcast27 = bitcast ptr %getelementptr22 to ptr
+ br label %bb28
+
+bb28: ; preds = %bb28, %bb11
+ %phi29 = phi i32 [ 0, %bb11 ], [ %add54, %bb28 ]
+ %phi30 = phi ptr [ %bitcast27, %bb11 ], [ %getelementptr36, %bb28 ]
+ %phi31 = phi ptr [ %bitcast26, %bb11 ], [ %getelementptr37, %bb28 ]
+ %phi32 = phi ptr [ %bitcast25, %bb11 ], [ %getelementptr39, %bb28 ]
+ %phi33 = phi ptr [ %bitcast24, %bb11 ], [ %getelementptr41, %bb28 ]
+ %phi34 = phi ptr [ %bitcast, %bb11 ], [ %getelementptr53, %bb28 ]
+ %phi35 = phi ptr [ %bitcast23, %bb11 ], [ %getelementptr52, %bb28 ]
+ %getelementptr36 = getelementptr inbounds <16 x i32>, ptr %phi30, i32 1
+ %load = load <16 x i32>, ptr %phi30, align 64, !tbaa !1
+ %getelementptr37 = getelementptr inbounds <16 x i32>, ptr %phi31, i32 1
+ %load38 = load <16 x i32>, ptr %phi31, align 64, !tbaa !1
+ %getelementptr39 = getelementptr inbounds <16 x i32>, ptr %phi32, i32 1
+ %load40 = load <16 x i32>, ptr %phi32, align 64, !tbaa !1
+ %getelementptr41 = getelementptr inbounds <16 x i32>, ptr %phi33, i32 1
+ %load42 = load <16 x i32>, ptr %phi33, align 64, !tbaa !1
+ %call = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load, <16 x i32> %load38)
+ %call43 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load, <16 x i32> %load38)
+ %call44 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load40, <16 x i32> %load42)
+ %call45 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load40, <16 x i32> %load42)
+ %call46 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call, <16 x i32> %call44)
+ %call47 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call, <16 x i32> %call44)
+ %call48 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call43, <16 x i32> %call45)
+ %call49 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call43, <16 x i32> %call45)
+ %call50 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call47, <16 x i32> %call46)
+ %call51 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call49, <16 x i32> %call48)
+ %getelementptr52 = getelementptr inbounds <16 x i32>, ptr %phi35, i32 1
+ store <16 x i32> %call50, ptr %phi35, align 64, !tbaa !1
+ %getelementptr53 = getelementptr inbounds <16 x i32>, ptr %phi34, i32 1
+ store <16 x i32> %call51, ptr %phi34, align 64, !tbaa !1
+ %add54 = add nsw i32 %phi29, 1
+ %icmp55 = icmp slt i32 %add54, %sdiv
+ br i1 %icmp55, label %bb28, label %bb56
+
+bb56: ; preds = %bb28
+ br label %bb57
+
+bb57: ; preds = %bb56, %bb9
+ %add58 = add nsw i32 %phi, 2
+ %icmp59 = icmp slt i32 %add58, %arg2
+ br i1 %icmp59, label %bb9, label %bb60
+
+bb60: ; preds = %bb57
+ br label %bb61
+
+bb61: ; preds = %bb60, %bb
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32>, <16 x i32>) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"Clang 3.1"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
More information about the llvm-commits
mailing list