[clang] [llvm] [Hexagon] Add Loop Alignment pass. (PR #83379)

Thu Feb 29 12:56:51 PST 2024

https://github.com/sgundapa updated https://github.com/llvm/llvm-project/pull/83379

>From 973d204ae0d8370704f1613e5206ac330a40e8f4 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sgundapa at quicinc.com>
Date: Wed, 28 Feb 2024 12:23:35 -0800
Subject: [PATCH] [Hexagon] Add Loop Alignment pass.

Inspect a basic block and if its single basic block loop with a small
number of instructions, set the Loop Alignment to 32 bytes.
This will avoid the cache line break in the first packet of
loop which will cause a stall per each execution of loop.
---
 clang/test/CodeGen/builtins-hexagon.c         |   2 +-
 llvm/lib/Target/Hexagon/CMakeLists.txt        |   1 +
 llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp  | 216 ++++++++++++++++++
 .../Target/Hexagon/HexagonTargetMachine.cpp   |   9 +-
 .../lib/Target/Hexagon/HexagonTargetMachine.h |   1 +
 llvm/test/CodeGen/Hexagon/loop-balign.ll      |  91 ++++++++
 llvm/test/CodeGen/Hexagon/loop_align_count.ll | 115 ++++++++++
 .../test/CodeGen/Hexagon/loop_align_count.mir | 130 +++++++++++
 llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll | 117 ++++++++++
 9 files changed, 680 insertions(+), 2 deletions(-)
 create mode 100644 llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp
 create mode 100644 llvm/test/CodeGen/Hexagon/loop-balign.ll
 create mode 100644 llvm/test/CodeGen/Hexagon/loop_align_count.ll
 create mode 100644 llvm/test/CodeGen/Hexagon/loop_align_count.mir
 create mode 100644 llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll

diff --git a/clang/test/CodeGen/builtins-hexagon.c b/clang/test/CodeGen/builtins-hexagon.c
index 9a1b733da5cdb8..52073f27ae70f5 100644
--- a/clang/test/CodeGen/builtins-hexagon.c
+++ b/clang/test/CodeGen/builtins-hexagon.c
@@ -1,5 +1,5 @@
 // REQUIRES: hexagon-registered-target
-// RUN: %clang_cc1 -triple hexagon-unknown-elf -target-cpu hexagonv65 -target-feature +hvxv65 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple hexagon-unknown-elf -target-cpu hexagonv65 -target-feature +hvxv65 -target-feature +hvx-length128b -emit-llvm %s -o - | FileCheck %s
 
 void test() {
   int v64 __attribute__((__vector_size__(64)));
diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt
index a22a5c11e6ab3a..cdc062eee72b1e 100644
--- a/llvm/lib/Target/Hexagon/CMakeLists.txt
+++ b/llvm/lib/Target/Hexagon/CMakeLists.txt
@@ -43,6 +43,7 @@ add_llvm_target(HexagonCodeGen
   HexagonISelDAGToDAGHVX.cpp
   HexagonISelLowering.cpp
   HexagonISelLoweringHVX.cpp
+  HexagonLoopAlign.cpp
   HexagonLoopIdiomRecognition.cpp
   HexagonMachineFunctionInfo.cpp
   HexagonMachineScheduler.cpp
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp b/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp
new file mode 100644
index 00000000000000..c79b528ff2f3f9
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp
@@ -0,0 +1,216 @@
+//===----- HexagonLoopAlign.cpp - Generate loop alignment directives  -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Inspect a basic block and if its single basic block loop with a small
+// number of instructions, set the prefLoopAlignment to 32 bytes (5).
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-loop-align"
+
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+    DisableLoopAlign("disable-hexagon-loop-align", cl::Hidden,
+                     cl::desc("Disable Hexagon loop alignment pass"));
+
+static cl::opt<uint32_t> HVXLoopAlignLimitUB(
+    "hexagon-hvx-loop-align-limit-ub", cl::Hidden, cl::init(16),
+    cl::desc("Set hexagon hvx loop upper bound align limit"));
+
+static cl::opt<uint32_t> TinyLoopAlignLimitUB(
+    "hexagon-tiny-loop-align-limit-ub", cl::Hidden, cl::init(16),
+    cl::desc("Set hexagon tiny-core loop upper bound align limit"));
+
+static cl::opt<uint32_t>
+    LoopAlignLimitUB("hexagon-loop-align-limit-ub", cl::Hidden, cl::init(8),
+                     cl::desc("Set hexagon loop upper bound align limit"));
+
+static cl::opt<uint32_t>
+    LoopAlignLimitLB("hexagon-loop-align-limit-lb", cl::Hidden, cl::init(4),
+                     cl::desc("Set hexagon loop lower bound align limit"));
+
+static cl::opt<uint32_t>
+    LoopBndlAlignLimit("hexagon-loop-bundle-align-limit", cl::Hidden,
+                       cl::init(4),
+                       cl::desc("Set hexagon loop align bundle limit"));
+
+static cl::opt<uint32_t> TinyLoopBndlAlignLimit(
+    "hexagon-tiny-loop-bundle-align-limit", cl::Hidden, cl::init(8),
+    cl::desc("Set hexagon tiny-core loop align bundle limit"));
+
+static cl::opt<uint32_t>
+    LoopEdgeThreshold("hexagon-loop-edge-threshold", cl::Hidden, cl::init(7500),
+                      cl::desc("Set hexagon loop align edge theshold"));
+
+namespace llvm {
+FunctionPass *createHexagonLoopAlign();
+void initializeHexagonLoopAlignPass(PassRegistry &);
+} // namespace llvm
+
+namespace {
+
+class HexagonLoopAlign : public MachineFunctionPass {
+  const HexagonSubtarget *HST = nullptr;
+  const TargetMachine *HTM = nullptr;
+  const HexagonInstrInfo *HII = nullptr;
+
+public:
+  static char ID;
+  HexagonLoopAlign() : MachineFunctionPass(ID) {
+    initializeHexagonLoopAlignPass(*PassRegistry::getPassRegistry());
+  }
+  bool shouldBalignLoop(MachineBasicBlock &BB, bool AboveThres);
+  bool isSingleLoop(MachineBasicBlock &MBB);
+  bool attemptToBalignSmallLoop(MachineFunction &MF, MachineBasicBlock &MBB);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineBranchProbabilityInfo>();
+    AU.addRequired<MachineBlockFrequencyInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return "Hexagon LoopAlign pass"; }
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+char HexagonLoopAlign::ID = 0;
+
+bool HexagonLoopAlign::shouldBalignLoop(MachineBasicBlock &BB,
+                                        bool AboveThres) {
+  bool isVec = false;
+  unsigned InstCnt = 0;
+  unsigned BndlCnt = 0;
+
+  for (MachineBasicBlock::instr_iterator II = BB.instr_begin(),
+                                         IE = BB.instr_end();
+       II != IE; ++II) {
+
+    // End if the instruction is endloop.
+    if (HII->isEndLoopN(II->getOpcode()))
+      break;
+    // Count the number of bundles.
+    if (II->isBundle()) {
+      BndlCnt++;
+      continue;
+    }
+    // Skip over debug instructions.
+    if (II->isDebugInstr())
+      continue;
+    // Check if there are any HVX instructions in loop.
+    isVec |= HII->isHVXVec(*II);
+    // Count the number of instructions.
+    InstCnt++;
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "Bundle Count : " << BndlCnt << "\n";
+    dbgs() << "Instruction Count : " << InstCnt << "\n";
+  });
+
+  unsigned LimitUB = 0;
+  unsigned LimitBndl = LoopBndlAlignLimit;
+  // The conditions in the order of priority.
+  if (HST->isTinyCore()) {
+    LimitUB = TinyLoopAlignLimitUB;
+    LimitBndl = TinyLoopBndlAlignLimit;
+  } else if (isVec)
+    LimitUB = HVXLoopAlignLimitUB;
+  else if (AboveThres)
+    LimitUB = LoopAlignLimitUB;
+
+  // if the upper bound is not set to a value, implies we didn't meet
+  // the criteria.
+  if (LimitUB == 0)
+    return false;
+
+  return InstCnt >= LoopAlignLimitLB && InstCnt <= LimitUB &&
+         BndlCnt <= LimitBndl;
+}
+
+bool HexagonLoopAlign::isSingleLoop(MachineBasicBlock &MBB) {
+  int Succs = MBB.succ_size();
+  return (MBB.isSuccessor(&MBB) && (Succs == 2));
+}
+
+bool HexagonLoopAlign::attemptToBalignSmallLoop(MachineFunction &MF,
+                                                MachineBasicBlock &MBB) {
+  if (!isSingleLoop(MBB))
+    return false;
+
+  const MachineBranchProbabilityInfo *MBPI =
+      &getAnalysis<MachineBranchProbabilityInfo>();
+  const MachineBlockFrequencyInfo *MBFI =
+      &getAnalysis<MachineBlockFrequencyInfo>();
+
+  // Compute frequency of back edge,
+  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
+  BranchProbability BrProb = MBPI->getEdgeProbability(&MBB, &MBB);
+  BlockFrequency EdgeFreq = BlockFreq * BrProb;
+  LLVM_DEBUG({
+    dbgs() << "Loop Align Pass:\n";
+    dbgs() << "\tedge with freq(" << EdgeFreq.getFrequency() << ")\n";
+  });
+
+  bool AboveThres = EdgeFreq.getFrequency() > LoopEdgeThreshold;
+  if (shouldBalignLoop(MBB, AboveThres)) {
+    // We found a loop, change its alignment to be 32 (5).
+    MBB.setAlignment(llvm::Align(1 << 5));
+    return true;
+  }
+  return false;
+}
+
+// Inspect each basic block, and if its a single BB loop, see if it
+// meets the criteria for increasing alignment to 32.
+
+bool HexagonLoopAlign::runOnMachineFunction(MachineFunction &MF) {
+
+  HST = &MF.getSubtarget<HexagonSubtarget>();
+  HII = HST->getInstrInfo();
+  HTM = &MF.getTarget();
+
+  if (skipFunction(MF.getFunction()))
+    return false;
+  if (DisableLoopAlign)
+    return false;
+
+  // This optimization is performed at
+  // i) -O2 and above, and  when the loop has a HVX instruction.
+  // ii) -O3
+  if (HST->useHVXOps()) {
+    if (HTM->getOptLevel() < CodeGenOptLevel::Default)
+      return false;
+  } else {
+    if (HTM->getOptLevel() < CodeGenOptLevel::Aggressive)
+      return false;
+  }
+
+  bool Changed = false;
+  for (MachineFunction::iterator MBBi = MF.begin(), MBBe = MF.end();
+       MBBi != MBBe; ++MBBi) {
+    MachineBasicBlock &MBB = *MBBi;
+    Changed |= attemptToBalignSmallLoop(MF, MBB);
+  }
+  return Changed;
+}
+
+} // namespace
+
+INITIALIZE_PASS(HexagonLoopAlign, "hexagon-loop-align",
+                "Hexagon LoopAlign pass", false, false)
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonLoopAlign() { return new HexagonLoopAlign(); }
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 7d77286339399d..3c346c334d6d30 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -164,6 +164,7 @@ namespace llvm {
   void initializeHexagonGenMuxPass(PassRegistry&);
   void initializeHexagonHardwareLoopsPass(PassRegistry&);
   void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
+  void initializeHexagonLoopAlignPass(PassRegistry &);
   void initializeHexagonNewValueJumpPass(PassRegistry&);
   void initializeHexagonOptAddrModePass(PassRegistry&);
   void initializeHexagonPacketizerPass(PassRegistry&);
@@ -194,6 +195,7 @@ namespace llvm {
   FunctionPass *createHexagonHardwareLoops();
   FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
                                      CodeGenOptLevel OptLevel);
+  FunctionPass *createHexagonLoopAlign();
   FunctionPass *createHexagonLoopRescheduling();
   FunctionPass *createHexagonNewValueJump();
   FunctionPass *createHexagonOptAddrMode();
@@ -256,8 +258,10 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
           TT, CPU, FS, Options, getEffectiveRelocModel(RM),
           getEffectiveCodeModel(CM, CodeModel::Small),
           (HexagonNoOpt ? CodeGenOptLevel::None : OL)),
-      TLOF(std::make_unique<HexagonTargetObjectFile>()) {
+      TLOF(std::make_unique<HexagonTargetObjectFile>()),
+      Subtarget(Triple(TT), CPU, FS, *this) {
   initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
+  initializeHexagonLoopAlignPass(*PassRegistry::getPassRegistry());
   initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry());
   initAsmInfo();
 }
@@ -476,6 +480,9 @@ void HexagonPassConfig::addPreEmitPass() {
   // Packetization is mandatory: it handles gather/scatter at all opt levels.
   addPass(createHexagonPacketizer(NoOpt));
 
+  if (!NoOpt)
+    addPass(createHexagonLoopAlign());
+
   if (EnableVectorPrint)
     addPass(createHexagonVectorPrint());
 
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
index c5fed0cd65a814..34ff45b6acf345 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -23,6 +23,7 @@ namespace llvm {
 
 class HexagonTargetMachine : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  HexagonSubtarget Subtarget;
   mutable StringMap<std::unique_ptr<HexagonSubtarget>> SubtargetMap;
 
 public:
diff --git a/llvm/test/CodeGen/Hexagon/loop-balign.ll b/llvm/test/CodeGen/Hexagon/loop-balign.ll
new file mode 100644
index 00000000000000..9d1f42a4b14b18
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/loop-balign.ll
@@ -0,0 +1,91 @@
+; RUN: llc -march=hexagon -O3  < %s | FileCheck %s -check-prefix=BALIGN
+; BALIGN: .p2align{{.*}}5
+
+; The test for checking the alignment of 'for.body4.for.body4_crit_edge' basic block
+
+define dso_local void @foo(i32 %nCol, i32 %nRow, ptr nocapture %resMat) local_unnamed_addr {
+entry:
+  %shl = shl i32 %nRow, 2
+  %cmp36 = icmp sgt i32 %nRow, 0
+  %0 = add i32 %nCol, -1
+  %.inv = icmp slt i32 %0, 1
+  %1 = select i1 %.inv, i32 1, i32 %nCol
+  br label %Outerloop
+
+Outerloop:                                        ; preds = %for.end7, %entry
+  %r12.0 = phi i32 [ 0, %entry ], [ %inc8, %for.end7 ]
+  %r7_6.0 = phi i64 [ undef, %entry ], [ %r7_6.1.lcssa, %for.end7 ]
+  %r0i.0 = phi i32 [ undef, %entry ], [ %r0i.1.lcssa, %for.end7 ]
+  %r5.0 = phi ptr [ %resMat, %entry ], [ %r5.1.lcssa, %for.end7 ]
+  %r8.0 = phi i32 [ %shl, %entry ], [ %r8.1.lcssa, %for.end7 ]
+  br i1 %cmp36, label %for.body.lr.ph, label %for.end7
+
+for.body.lr.ph:                                   ; preds = %Outerloop
+  %cmp332 = icmp eq i32 %r12.0, 0
+  %exitcond.peel = icmp eq i32 %r12.0, 1
+  br label %for.body
+
+for.body:                                         ; preds = %for.end, %for.body.lr.ph
+  %r8.141 = phi i32 [ %r8.0, %for.body.lr.ph ], [ %add, %for.end ]
+  %r5.140 = phi ptr [ %r5.0, %for.body.lr.ph ], [ %add.ptr, %for.end ]
+  %i.039 = phi i32 [ 0, %for.body.lr.ph ], [ %inc6, %for.end ]
+  %r0i.138 = phi i32 [ %r0i.0, %for.body.lr.ph ], [ %4, %for.end ]
+  %r7_6.137 = phi i64 [ %r7_6.0, %for.body.lr.ph ], [ %r7_6.2.lcssa, %for.end ]
+  %add = add nsw i32 %r8.141, %shl
+  br i1 %cmp332, label %for.end, label %for.body4.peel
+
+for.body4.peel:                                   ; preds = %for.body
+  %r1i.0.in.peel = inttoptr i32 %r8.141 to ptr
+  %r1i.0.peel = load i32, ptr %r1i.0.in.peel, align 4
+  %2 = tail call i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64 %r7_6.137, i32 %r1i.0.peel, i32 %r0i.138)
+  br i1 %exitcond.peel, label %for.end, label %for.body4.preheader.peel.newph
+
+for.body4.preheader.peel.newph:                   ; preds = %for.body4.peel
+  %r1i.0.in = inttoptr i32 %add to ptr
+  %r1i.0 = load i32, ptr %r1i.0.in, align 4
+  br label %for.body4
+
+for.body4:                                        ; preds = %for.body4.for.body4_crit_edge, %for.body4.preheader.peel.newph
+  %inc.phi = phi i32 [ %inc.0, %for.body4.for.body4_crit_edge ], [ 2, %for.body4.preheader.peel.newph ]
+  %r7_6.233 = phi i64 [ %3, %for.body4.for.body4_crit_edge ], [ %2, %for.body4.preheader.peel.newph ]
+  %3 = tail call i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64 %r7_6.233, i32 %r1i.0, i32 %r0i.138)
+  %exitcond = icmp eq i32 %inc.phi, %r12.0
+  br i1 %exitcond, label %for.end.loopexit, label %for.body4.for.body4_crit_edge
+
+for.body4.for.body4_crit_edge:                    ; preds = %for.body4
+  %inc.0 = add nuw nsw i32 %inc.phi, 1
+  br label %for.body4
+
+for.end.loopexit:                                 ; preds = %for.body4
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %for.body4.peel, %for.body
+  %r7_6.2.lcssa = phi i64 [ %r7_6.137, %for.body ], [ %2, %for.body4.peel ], [ %3, %for.end.loopexit ]
+  %4 = tail call i32 @llvm.hexagon.S2.clbp(i64 %r7_6.2.lcssa)
+  store i32 %4, ptr %r5.140, align 4
+  %add.ptr = getelementptr inbounds i8, ptr %r5.140, i32 undef
+  %inc6 = add nuw nsw i32 %i.039, 1
+  %exitcond47 = icmp eq i32 %inc6, %nRow
+  br i1 %exitcond47, label %for.end7.loopexit, label %for.body
+
+for.end7.loopexit:                                ; preds = %for.end
+  br label %for.end7
+
+for.end7:                                         ; preds = %for.end7.loopexit, %Outerloop
+  %r7_6.1.lcssa = phi i64 [ %r7_6.0, %Outerloop ], [ %r7_6.2.lcssa, %for.end7.loopexit ]
+  %r0i.1.lcssa = phi i32 [ %r0i.0, %Outerloop ], [ %4, %for.end7.loopexit ]
+  %r5.1.lcssa = phi ptr [ %r5.0, %Outerloop ], [ %add.ptr, %for.end7.loopexit ]
+  %r8.1.lcssa = phi i32 [ %r8.0, %Outerloop ], [ %add, %for.end7.loopexit ]
+  %inc8 = add nuw i32 %r12.0, 1
+  %exitcond48 = icmp eq i32 %inc8, %1
+  br i1 %exitcond48, label %if.end, label %Outerloop
+
+if.end:                                           ; preds = %for.end7
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64, i32, i32) 
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.hexagon.S2.clbp(i64)
diff --git a/llvm/test/CodeGen/Hexagon/loop_align_count.ll b/llvm/test/CodeGen/Hexagon/loop_align_count.ll
new file mode 100644
index 00000000000000..07d7e4a8d61176
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/loop_align_count.ll
@@ -0,0 +1,115 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv73 -O2 -mattr=+hvxv73,hvx-length64b \
+; RUN: -debug-only=hexagon-loop-align 2>&1 < %s | FileCheck %s
+; Validate that there are 4 bundles in the loop.
+
+; CHECK: Loop Align Pass:
+; CHECK: Bundle Count : 4
+; CHECK: .p2align{{.*}}5
+
+; Function Attrs: nounwind
+define void @ham(ptr noalias nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3, ptr noalias nocapture %arg4, i32 %arg5) #0 {
+bb:
+  %ashr = ashr i32 %arg3, 2
+  %ashr6 = ashr i32 %arg3, 1
+  %add = add nsw i32 %ashr6, %ashr
+  %icmp = icmp sgt i32 %arg2, 0
+  br i1 %icmp, label %bb7, label %bb61
+
+bb7:                                              ; preds = %bb
+  %sdiv = sdiv i32 %arg1, 64
+  %icmp8 = icmp sgt i32 %arg1, 63
+  br label %bb9
+
+bb9:                                              ; preds = %bb57, %bb7
+  %phi = phi i32 [ 0, %bb7 ], [ %add58, %bb57 ]
+  %ashr10 = ashr exact i32 %phi, 1
+  %mul = mul nsw i32 %ashr10, %arg3
+  br i1 %icmp8, label %bb11, label %bb57
+
+bb11:                                             ; preds = %bb9
+  %add12 = add nsw i32 %phi, 1
+  %mul13 = mul nsw i32 %add12, %arg5
+  %mul14 = mul nsw i32 %phi, %arg5
+  %add15 = add i32 %add, %mul
+  %add16 = add i32 %mul, %ashr
+  %add17 = add i32 %mul, %ashr6
+  %getelementptr = getelementptr inbounds i8, ptr %arg4, i32 %mul13
+  %getelementptr18 = getelementptr inbounds i8, ptr %arg4, i32 %mul14
+  %getelementptr19 = getelementptr inbounds i16, ptr %arg, i32 %add15
+  %getelementptr20 = getelementptr inbounds i16, ptr %arg, i32 %add16
+  %getelementptr21 = getelementptr inbounds i16, ptr %arg, i32 %add17
+  %getelementptr22 = getelementptr inbounds i16, ptr %arg, i32 %mul
+  %bitcast = bitcast ptr %getelementptr to ptr
+  %bitcast23 = bitcast ptr %getelementptr18 to ptr
+  %bitcast24 = bitcast ptr %getelementptr19 to ptr
+  %bitcast25 = bitcast ptr %getelementptr20 to ptr
+  %bitcast26 = bitcast ptr %getelementptr21 to ptr
+  %bitcast27 = bitcast ptr %getelementptr22 to ptr
+  br label %bb28
+
+bb28:                                             ; preds = %bb28, %bb11
+  %phi29 = phi i32 [ 0, %bb11 ], [ %add54, %bb28 ]
+  %phi30 = phi ptr [ %bitcast27, %bb11 ], [ %getelementptr36, %bb28 ]
+  %phi31 = phi ptr [ %bitcast26, %bb11 ], [ %getelementptr37, %bb28 ]
+  %phi32 = phi ptr [ %bitcast25, %bb11 ], [ %getelementptr39, %bb28 ]
+  %phi33 = phi ptr [ %bitcast24, %bb11 ], [ %getelementptr41, %bb28 ]
+  %phi34 = phi ptr [ %bitcast, %bb11 ], [ %getelementptr53, %bb28 ]
+  %phi35 = phi ptr [ %bitcast23, %bb11 ], [ %getelementptr52, %bb28 ]
+  %getelementptr36 = getelementptr inbounds <16 x i32>, ptr %phi30, i32 1
+  %load = load <16 x i32>, ptr %phi30, align 64
+  %getelementptr37 = getelementptr inbounds <16 x i32>, ptr %phi31, i32 1
+  %load38 = load <16 x i32>, ptr %phi31, align 64
+  %getelementptr39 = getelementptr inbounds <16 x i32>, ptr %phi32, i32 1
+  %load40 = load <16 x i32>, ptr %phi32, align 64
+  %getelementptr41 = getelementptr inbounds <16 x i32>, ptr %phi33, i32 1
+  %load42 = load <16 x i32>, ptr %phi33, align 64
+  %call = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load, <16 x i32> %load38)
+  %call43 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load, <16 x i32> %load38)
+  %call44 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load40, <16 x i32> %load42)
+  %call45 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load40, <16 x i32> %load42)
+  %call46 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call, <16 x i32> %call44)
+  %call47 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call, <16 x i32> %call44)
+  %call48 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call43, <16 x i32> %call45)
+  %call49 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call43, <16 x i32> %call45)
+  %call50 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call47, <16 x i32> %call46)
+  %call51 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call49, <16 x i32> %call48)
+  %getelementptr52 = getelementptr inbounds <16 x i32>, ptr %phi35, i32 1
+  store <16 x i32> %call50, ptr %phi35, align 64
+  %getelementptr53 = getelementptr inbounds <16 x i32>, ptr %phi34, i32 1
+  store <16 x i32> %call51, ptr %phi34, align 64
+  %add54 = add nsw i32 %phi29, 1
+  %icmp55 = icmp slt i32 %add54, %sdiv
+  br i1 %icmp55, label %bb28, label %bb56
+
+bb56:                                             ; preds = %bb28
+  br label %bb57
+
+bb57:                                             ; preds = %bb56, %bb9
+  %add58 = add nsw i32 %phi, 2
+  %icmp59 = icmp slt i32 %add58, %arg2
+  br i1 %icmp59, label %bb9, label %bb60
+
+bb60:                                             ; preds = %bb57
+  br label %bb61
+
+bb61:                                             ; preds = %bb60, %bb
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32>, <16 x i32>) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/Hexagon/loop_align_count.mir b/llvm/test/CodeGen/Hexagon/loop_align_count.mir
new file mode 100644
index 00000000000000..afbd917f4f0db3
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/loop_align_count.mir
@@ -0,0 +1,130 @@
+# RUN: llc -march=hexagon -O3 -run-pass hexagon-loop-align -o - %s\
+# RUN: -debug-only=hexagon-loop-align -verify-machineinstrs 2>&1 | FileCheck %s
+
+# Test that we only count til endloop instruction and we align this
+# loop to 32.
+# CHECK: Loop Align Pass:
+# CHECK: Instruction Count : 16
+# CHECK: bb.5 (align 32)
+---
+name: fred
+tracksRegLiveness: true
+
+body:             |
+  bb.0:
+    successors: %bb.1(0x50000000), %bb.8(0x30000000)
+    liveins: $r0, $r1, $r2, $r3, $r4, $r5
+
+    renamable $p0 = C2_cmpgti renamable $r2, 0
+    J2_jumpf killed renamable $p0, %bb.8, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+    liveins: $r0, $r1, $r2, $r3, $r4, $r5
+
+    renamable $r7 = A2_addi killed renamable $r2, 1
+    renamable $r8 = S2_asr_i_r renamable $r1, 31
+    renamable $p0 = C2_cmpgti renamable $r1, 63
+    renamable $r2 = S2_asr_i_r renamable $r3, 2
+    renamable $r6 = S2_asr_i_r renamable $r3, 1
+    renamable $r9 = S2_lsr_i_r killed renamable $r7, 1
+    renamable $r1 = S2_lsr_i_r_acc killed renamable $r1, killed renamable $r8, 26
+    renamable $r7 = A2_tfrsi 0
+    renamable $r1 = S2_asr_i_r killed renamable $r1, 6
+    J2_loop1r %bb.2, killed renamable $r9, implicit-def $lc1, implicit-def $sa1
+    renamable $r8 = nsw A2_add renamable $r6, renamable $r2
+
+  bb.2:
+    successors: %bb.3(0x40000000), %bb.7(0x40000000)
+    liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8
+
+    J2_jumpf renamable $p0, %bb.7, implicit-def dead $pc
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.3:
+    successors: %bb.4(0x80000000)
+    liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8
+
+    renamable $r13 = exact S2_asr_i_r renamable $r7, 1
+    renamable $r12 = COPY renamable $r4
+    renamable $r9 = COPY renamable $r4
+    renamable $r14 = nsw A2_addi renamable $r7, 1
+    renamable $r15 = nsw M2_mpyi killed renamable $r13, renamable $r3
+    renamable $r9 = M2_maci killed renamable $r9, killed renamable $r14, renamable $r5
+    renamable $r13 = A2_add renamable $r8, renamable $r15
+    renamable $r28 = A2_add renamable $r15, renamable $r2
+    renamable $r10 = A2_add renamable $r15, renamable $r6
+    renamable $r12 = M2_maci killed renamable $r12, renamable $r7, renamable $r5
+    renamable $r13 = S2_addasl_rrri renamable $r0, killed renamable $r13, 1
+    renamable $r14 = S2_addasl_rrri renamable $r0, killed renamable $r15, 1
+    renamable $r15 = S2_addasl_rrri renamable $r0, killed renamable $r28, 1
+    renamable $r28 = S2_addasl_rrri renamable $r0, killed renamable $r10, 1
+
+  bb.4:
+    successors: %bb.5(0x40000000), %bb.6(0x40000000)
+    liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $r13, $r14, $r15, $r28
+
+    renamable $v0, renamable $r14 = V6_vL32b_pi killed renamable $r14, 64
+    renamable $p1 = C2_cmpgtui renamable $r1, 1
+    renamable $r10 = A2_addi renamable $r1, -1
+    renamable $v2, renamable $r28 = V6_vL32b_pi killed renamable $r28, 64
+    renamable $v1 = V6_vaddh renamable $v0, renamable $v2
+    renamable $v3, renamable $r15 = V6_vL32b_pi killed renamable $r15, 64
+    renamable $v0 = V6_vsubh killed renamable $v0, killed renamable $v2
+    J2_loop0r %bb.5, killed renamable $r10, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    renamable $v4, renamable $r13 = V6_vL32b_pi killed renamable $r13, 64
+    renamable $v2 = V6_vaddh renamable $v3, renamable $v4
+    J2_jumpf killed renamable $p1, %bb.6, implicit-def $pc
+    J2_jump %bb.5, implicit-def $pc
+
+  bb.5:
+    successors: %bb.5(0x7c000000), %bb.6(0x04000000)
+    liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $r13, $r14, $r15, $r28, $v0, $v1, $v2, $v3, $v4
+
+    renamable $v3 = V6_vsubh killed renamable $v3, killed renamable $v4
+    renamable $v4, renamable $r14 = V6_vL32b_pi killed renamable $r14, 64
+    renamable $v5 = V6_vnavgh renamable $v1, renamable $v2
+    renamable $v1 = V6_vavgh killed renamable $v1, killed renamable $v2
+    renamable $v2, renamable $r28 = V6_vL32b_pi killed renamable $r28, 64
+    renamable $v1 = V6_vsathub killed renamable $v5, killed renamable $v1
+    renamable $v5 = V6_vnavgh renamable $v0, renamable $v3
+    renamable $v6 = V6_vavgh killed renamable $v0, killed renamable $v3
+    renamable $r12 = V6_vS32b_pi killed renamable $r12, 64, killed renamable $v1
+    renamable $v1 = V6_vaddh renamable $v4, renamable $v2
+    renamable $v3, renamable $r15 = V6_vL32b_pi killed renamable $r15, 64
+    renamable $v0 = V6_vsubh killed renamable $v4, killed renamable $v2
+    renamable $v4, renamable $r13 = V6_vL32b_pi killed renamable $r13, 64
+    renamable $v2 = V6_vaddh renamable $v3, renamable $v4
+    renamable $v5 = V6_vsathub killed renamable $v5, killed renamable $v6
+    renamable $r9 = V6_vS32b_pi killed renamable $r9, 64, killed renamable $v5
+    ENDLOOP0 %bb.5, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.6, implicit-def $pc
+
+  bb.6:
+    successors: %bb.7(0x80000000)
+    liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $v0, $v1, $v2, $v3, $v4
+
+    renamable $v3 = V6_vsubh killed renamable $v3, killed renamable $v4
+    renamable $v4 = V6_vavgh renamable $v1, renamable $v2
+    renamable $v1 = V6_vnavgh killed renamable $v1, killed renamable $v2
+    renamable $v2 = V6_vavgh renamable $v0, renamable $v3
+    renamable $v0 = V6_vnavgh killed renamable $v0, killed renamable $v3
+    renamable $v1 = V6_vsathub killed renamable $v1, killed renamable $v4
+    dead renamable $r12 = V6_vS32b_pi killed renamable $r12, 64, killed renamable $v1
+    renamable $v0 = V6_vsathub killed renamable $v0, killed renamable $v2
+    dead renamable $r9 = V6_vS32b_pi killed renamable $r9, 64, killed renamable $v0
+    J2_jump %bb.7, implicit-def $pc
+
+  bb.7:
+    successors: %bb.2(0x7c000000), %bb.8(0x04000000)
+    liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8
+
+    renamable $r7 = nsw A2_addi killed renamable $r7, 2
+    ENDLOOP1 %bb.2, implicit-def $pc, implicit-def $lc1, implicit $sa1, implicit $lc1
+    J2_jump %bb.8, implicit-def dead $pc
+
+  bb.8:
+    PS_jmpret $r31, implicit-def dead $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll b/llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll
new file mode 100644
index 00000000000000..6b3c0a94a494dc
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll
@@ -0,0 +1,117 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv73 -O2 -mattr=+hvxv73,hvx-length64b  < %s | FileCheck %s
+; CHECK: .p2align{{.*}}5
+
+; Function Attrs: nounwind
+define void @wobble(ptr noalias nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3, ptr noalias nocapture %arg4, i32 %arg5) #0 {
+bb:
+  %ashr = ashr i32 %arg3, 2
+  %ashr6 = ashr i32 %arg3, 1
+  %add = add nsw i32 %ashr6, %ashr
+  %icmp = icmp sgt i32 %arg2, 0
+  br i1 %icmp, label %bb7, label %bb61
+
+bb7:                                              ; preds = %bb
+  %sdiv = sdiv i32 %arg1, 64
+  %icmp8 = icmp sgt i32 %arg1, 63
+  br label %bb9
+
+bb9:                                              ; preds = %bb57, %bb7
+  %phi = phi i32 [ 0, %bb7 ], [ %add58, %bb57 ]
+  %ashr10 = ashr exact i32 %phi, 1
+  %mul = mul nsw i32 %ashr10, %arg3
+  br i1 %icmp8, label %bb11, label %bb57
+
+bb11:                                             ; preds = %bb9
+  %add12 = add nsw i32 %phi, 1
+  %mul13 = mul nsw i32 %add12, %arg5
+  %mul14 = mul nsw i32 %phi, %arg5
+  %add15 = add i32 %add, %mul
+  %add16 = add i32 %mul, %ashr
+  %add17 = add i32 %mul, %ashr6
+  %getelementptr = getelementptr inbounds i8, ptr %arg4, i32 %mul13
+  %getelementptr18 = getelementptr inbounds i8, ptr %arg4, i32 %mul14
+  %getelementptr19 = getelementptr inbounds i16, ptr %arg, i32 %add15
+  %getelementptr20 = getelementptr inbounds i16, ptr %arg, i32 %add16
+  %getelementptr21 = getelementptr inbounds i16, ptr %arg, i32 %add17
+  %getelementptr22 = getelementptr inbounds i16, ptr %arg, i32 %mul
+  %bitcast = bitcast ptr %getelementptr to ptr
+  %bitcast23 = bitcast ptr %getelementptr18 to ptr
+  %bitcast24 = bitcast ptr %getelementptr19 to ptr
+  %bitcast25 = bitcast ptr %getelementptr20 to ptr
+  %bitcast26 = bitcast ptr %getelementptr21 to ptr
+  %bitcast27 = bitcast ptr %getelementptr22 to ptr
+  br label %bb28
+
+bb28:                                             ; preds = %bb28, %bb11
+  %phi29 = phi i32 [ 0, %bb11 ], [ %add54, %bb28 ]
+  %phi30 = phi ptr [ %bitcast27, %bb11 ], [ %getelementptr36, %bb28 ]
+  %phi31 = phi ptr [ %bitcast26, %bb11 ], [ %getelementptr37, %bb28 ]
+  %phi32 = phi ptr [ %bitcast25, %bb11 ], [ %getelementptr39, %bb28 ]
+  %phi33 = phi ptr [ %bitcast24, %bb11 ], [ %getelementptr41, %bb28 ]
+  %phi34 = phi ptr [ %bitcast, %bb11 ], [ %getelementptr53, %bb28 ]
+  %phi35 = phi ptr [ %bitcast23, %bb11 ], [ %getelementptr52, %bb28 ]
+  %getelementptr36 = getelementptr inbounds <16 x i32>, ptr %phi30, i32 1
+  %load = load <16 x i32>, ptr %phi30, align 64, !tbaa !1
+  %getelementptr37 = getelementptr inbounds <16 x i32>, ptr %phi31, i32 1
+  %load38 = load <16 x i32>, ptr %phi31, align 64, !tbaa !1
+  %getelementptr39 = getelementptr inbounds <16 x i32>, ptr %phi32, i32 1
+  %load40 = load <16 x i32>, ptr %phi32, align 64, !tbaa !1
+  %getelementptr41 = getelementptr inbounds <16 x i32>, ptr %phi33, i32 1
+  %load42 = load <16 x i32>, ptr %phi33, align 64, !tbaa !1
+  %call = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load, <16 x i32> %load38)
+  %call43 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load, <16 x i32> %load38)
+  %call44 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load40, <16 x i32> %load42)
+  %call45 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load40, <16 x i32> %load42)
+  %call46 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call, <16 x i32> %call44)
+  %call47 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call, <16 x i32> %call44)
+  %call48 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call43, <16 x i32> %call45)
+  %call49 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call43, <16 x i32> %call45)
+  %call50 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call47, <16 x i32> %call46)
+  %call51 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call49, <16 x i32> %call48)
+  %getelementptr52 = getelementptr inbounds <16 x i32>, ptr %phi35, i32 1
+  store <16 x i32> %call50, ptr %phi35, align 64, !tbaa !1
+  %getelementptr53 = getelementptr inbounds <16 x i32>, ptr %phi34, i32 1
+  store <16 x i32> %call51, ptr %phi34, align 64, !tbaa !1
+  %add54 = add nsw i32 %phi29, 1
+  %icmp55 = icmp slt i32 %add54, %sdiv
+  br i1 %icmp55, label %bb28, label %bb56
+
+bb56:                                             ; preds = %bb28
+  br label %bb57
+
+bb57:                                             ; preds = %bb56, %bb9
+  %add58 = add nsw i32 %phi, 2
+  %icmp59 = icmp slt i32 %add58, %arg2
+  br i1 %icmp59, label %bb9, label %bb60
+
+bb60:                                             ; preds = %bb57
+  br label %bb61
+
+bb61:                                             ; preds = %bb60, %bb
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32>, <16 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32>, <16 x i32>) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"Clang 3.1"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}