[clang] [llvm] [Hexagon] Add Loop Alignment pass. (PR #83379)

Thu Feb 29 12:57:20 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-clang

Author: Sumanth Gundapaneni (sgundapa)

<details>
<summary>Changes</summary>

Inspect a basic block and if its single basic block loop with a small number of instructions, set the Loop Alignment to 32 bytes. This will avoid the cache line break in the first packet of loop which will cause a stall per each execution of loop.

---

Patch is 34.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83379.diff


9 Files Affected:

- (modified) clang/test/CodeGen/builtins-hexagon.c (+1-1) 
- (modified) llvm/lib/Target/Hexagon/CMakeLists.txt (+1) 
- (added) llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp (+216) 
- (modified) llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp (+8-1) 
- (modified) llvm/lib/Target/Hexagon/HexagonTargetMachine.h (+1) 
- (added) llvm/test/CodeGen/Hexagon/loop-balign.ll (+91) 
- (added) llvm/test/CodeGen/Hexagon/loop_align_count.ll (+115) 
- (added) llvm/test/CodeGen/Hexagon/loop_align_count.mir (+130) 
- (added) llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll (+117) 


``````````diff

diff --git a/clang/test/CodeGen/builtins-hexagon.c b/clang/test/CodeGen/builtins-hexagon.c
index 9a1b733da5cdb8..52073f27ae70f5 100644
--- a/clang/test/CodeGen/builtins-hexagon.c
+++ b/clang/test/CodeGen/builtins-hexagon.c
@@ -1,5 +1,5 @@
 // REQUIRES: hexagon-registered-target
-// RUN: %clang_cc1 -triple hexagon-unknown-elf -target-cpu hexagonv65 -target-feature +hvxv65 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple hexagon-unknown-elf -target-cpu hexagonv65 -target-feature +hvxv65 -target-feature +hvx-length128b -emit-llvm %s -o - | FileCheck %s
 
 void test() {
   int v64 __attribute__((__vector_size__(64)));
diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt
index a22a5c11e6ab3a..cdc062eee72b1e 100644
--- a/llvm/lib/Target/Hexagon/CMakeLists.txt
+++ b/llvm/lib/Target/Hexagon/CMakeLists.txt
@@ -43,6 +43,7 @@ add_llvm_target(HexagonCodeGen
   HexagonISelDAGToDAGHVX.cpp
   HexagonISelLowering.cpp
   HexagonISelLoweringHVX.cpp
+  HexagonLoopAlign.cpp
   HexagonLoopIdiomRecognition.cpp
   HexagonMachineFunctionInfo.cpp
   HexagonMachineScheduler.cpp
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp b/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp
new file mode 100644
index 00000000000000..c79b528ff2f3f9
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp
@@ -0,0 +1,216 @@
+//===----- HexagonLoopAlign.cpp - Generate loop alignment directives  -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Inspect a basic block and if its single basic block loop with a small
+// number of instructions, set the prefLoopAlignment to 32 bytes (5).
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-loop-align"
+
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+    DisableLoopAlign("disable-hexagon-loop-align", cl::Hidden,
+                     cl::desc("Disable Hexagon loop alignment pass"));
+
+static cl::opt<uint32_t> HVXLoopAlignLimitUB(
+    "hexagon-hvx-loop-align-limit-ub", cl::Hidden, cl::init(16),
+    cl::desc("Set hexagon hvx loop upper bound align limit"));
+
+static cl::opt<uint32_t> TinyLoopAlignLimitUB(
+    "hexagon-tiny-loop-align-limit-ub", cl::Hidden, cl::init(16),
+    cl::desc("Set hexagon tiny-core loop upper bound align limit"));
+
+static cl::opt<uint32_t>
+    LoopAlignLimitUB("hexagon-loop-align-limit-ub", cl::Hidden, cl::init(8),
+                     cl::desc("Set hexagon loop upper bound align limit"));
+
+static cl::opt<uint32_t>
+    LoopAlignLimitLB("hexagon-loop-align-limit-lb", cl::Hidden, cl::init(4),
+                     cl::desc("Set hexagon loop lower bound align limit"));
+
+static cl::opt<uint32_t>
+    LoopBndlAlignLimit("hexagon-loop-bundle-align-limit", cl::Hidden,
+                       cl::init(4),
+                       cl::desc("Set hexagon loop align bundle limit"));
+
+static cl::opt<uint32_t> TinyLoopBndlAlignLimit(
+    "hexagon-tiny-loop-bundle-align-limit", cl::Hidden, cl::init(8),
+    cl::desc("Set hexagon tiny-core loop align bundle limit"));
+
+static cl::opt<uint32_t>
+    LoopEdgeThreshold("hexagon-loop-edge-threshold", cl::Hidden, cl::init(7500),
+                      cl::desc("Set hexagon loop align edge theshold"));
+
+namespace llvm {
+FunctionPass *createHexagonLoopAlign();
+void initializeHexagonLoopAlignPass(PassRegistry &);
+} // namespace llvm
+
+namespace {
+
+class HexagonLoopAlign : public MachineFunctionPass {
+  const HexagonSubtarget *HST = nullptr;
+  const TargetMachine *HTM = nullptr;
+  const HexagonInstrInfo *HII = nullptr;
+
+public:
+  static char ID;
+  HexagonLoopAlign() : MachineFunctionPass(ID) {
+    initializeHexagonLoopAlignPass(*PassRegistry::getPassRegistry());
+  }
+  bool shouldBalignLoop(MachineBasicBlock &BB, bool AboveThres);
+  bool isSingleLoop(MachineBasicBlock &MBB);
+  bool attemptToBalignSmallLoop(MachineFunction &MF, MachineBasicBlock &MBB);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineBranchProbabilityInfo>();
+    AU.addRequired<MachineBlockFrequencyInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return "Hexagon LoopAlign pass"; }
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+char HexagonLoopAlign::ID = 0;
+
+bool HexagonLoopAlign::shouldBalignLoop(MachineBasicBlock &BB,
+                                        bool AboveThres) {
+  bool isVec = false;
+  unsigned InstCnt = 0;
+  unsigned BndlCnt = 0;
+
+  for (MachineBasicBlock::instr_iterator II = BB.instr_begin(),
+                                         IE = BB.instr_end();
+       II != IE; ++II) {
+
+    // End if the instruction is endloop.
+    if (HII->isEndLoopN(II->getOpcode()))
+      break;
+    // Count the number of bundles.
+    if (II->isBundle()) {
+      BndlCnt++;
+      continue;
+    }
+    // Skip over debug instructions.
+    if (II->isDebugInstr())
+      continue;
+    // Check if there are any HVX instructions in loop.
+    isVec |= HII->isHVXVec(*II);
+    // Count the number of instructions.
+    InstCnt++;
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "Bundle Count : " << BndlCnt << "\n";
+    dbgs() << "Instruction Count : " << InstCnt << "\n";
+  });
+
+  unsigned LimitUB = 0;
+  unsigned LimitBndl = LoopBndlAlignLimit;
+  // The conditions in the order of priority.
+  if (HST->isTinyCore()) {
+    LimitUB = TinyLoopAlignLimitUB;
+    LimitBndl = TinyLoopBndlAlignLimit;
+  } else if (isVec)
+    LimitUB = HVXLoopAlignLimitUB;
+  else if (AboveThres)
+    LimitUB = LoopAlignLimitUB;
+
+  // if the upper bound is not set to a value, implies we didn't meet
+  // the criteria.
+  if (LimitUB == 0)
+    return false;
+
+  return InstCnt >= LoopAlignLimitLB && InstCnt <= LimitUB &&
+         BndlCnt <= LimitBndl;
+}
+
+bool HexagonLoopAlign::isSingleLoop(MachineBasicBlock &MBB) {
+  int Succs = MBB.succ_size();
+  return (MBB.isSuccessor(&MBB) && (Succs == 2));
+}
+
+bool HexagonLoopAlign::attemptToBalignSmallLoop(MachineFunction &MF,
+                                                MachineBasicBlock &MBB) {
+  if (!isSingleLoop(MBB))
+    return false;
+
+  const MachineBranchProbabilityInfo *MBPI =
+      &getAnalysis<MachineBranchProbabilityInfo>();
+  const MachineBlockFrequencyInfo *MBFI =
+      &getAnalysis<MachineBlockFrequencyInfo>();
+
+  // Compute frequency of back edge,
+  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
+  BranchProbability BrProb = MBPI->getEdgeProbability(&MBB, &MBB);
+  BlockFrequency EdgeFreq = BlockFreq * BrProb;
+  LLVM_DEBUG({
+    dbgs() << "Loop Align Pass:\n";
+    dbgs() << "\tedge with freq(" << EdgeFreq.getFrequency() << ")\n";
+  });
+
+  bool AboveThres = EdgeFreq.getFrequency() > LoopEdgeThreshold;
+  if (shouldBalignLoop(MBB, AboveThres)) {
+    // We found a loop, change its alignment to be 32 (5).
+    MBB.setAlignment(llvm::Align(1 << 5));
+    return true;
+  }
+  return false;
+}
+
+// Inspect each basic block, and if its a single BB loop, see if it
+// meets the criteria for increasing alignment to 32.
+
+bool HexagonLoopAlign::runOnMachineFunction(MachineFunction &MF) {
+
+  HST = &MF.getSubtarget<HexagonSubtarget>();
+  HII = HST->getInstrInfo();
+  HTM = &MF.getTarget();
+
+  if (skipFunction(MF.getFunction()))
+    return false;
+  if (DisableLoopAlign)
+    return false;
+
+  // This optimization is performed at
+  // i) -O2 and above, and  when the loop has a HVX instruction.
+  // ii) -O3
+  if (HST->useHVXOps()) {
+    if (HTM->getOptLevel() < CodeGenOptLevel::Default)
+      return false;
+  } else {
+    if (HTM->getOptLevel() < CodeGenOptLevel::Aggressive)
+      return false;
+  }
+
+  bool Changed = false;
+  for (MachineFunction::iterator MBBi = MF.begin(), MBBe = MF.end();
+       MBBi != MBBe; ++MBBi) {
+    MachineBasicBlock &MBB = *MBBi;
+    Changed |= attemptToBalignSmallLoop(MF, MBB);
+  }
+  return Changed;
+}
+
+} // namespace
+
+INITIALIZE_PASS(HexagonLoopAlign, "hexagon-loop-align",
+                "Hexagon LoopAlign pass", false, false)
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonLoopAlign() { return new HexagonLoopAlign(); }
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 7d77286339399d..3c346c334d6d30 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -164,6 +164,7 @@ namespace llvm {
   void initializeHexagonGenMuxPass(PassRegistry&);
   void initializeHexagonHardwareLoopsPass(PassRegistry&);
   void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
+  void initializeHexagonLoopAlignPass(PassRegistry &);
   void initializeHexagonNewValueJumpPass(PassRegistry&);
   void initializeHexagonOptAddrModePass(PassRegistry&);
   void initializeHexagonPacketizerPass(PassRegistry&);
@@ -194,6 +195,7 @@ namespace llvm {
   FunctionPass *createHexagonHardwareLoops();
   FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
                                      CodeGenOptLevel OptLevel);
+  FunctionPass *createHexagonLoopAlign();
   FunctionPass *createHexagonLoopRescheduling();
   FunctionPass *createHexagonNewValueJump();
   FunctionPass *createHexagonOptAddrMode();
@@ -256,8 +258,10 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
           TT, CPU, FS, Options, getEffectiveRelocModel(RM),
           getEffectiveCodeModel(CM, CodeModel::Small),
           (HexagonNoOpt ? CodeGenOptLevel::None : OL)),
-      TLOF(std::make_unique<HexagonTargetObjectFile>()) {
+      TLOF(std::make_unique<HexagonTargetObjectFile>()),
+      Subtarget(Triple(TT), CPU, FS, *this) {
   initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
+  initializeHexagonLoopAlignPass(*PassRegistry::getPassRegistry());
   initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry());
   initAsmInfo();
 }
@@ -476,6 +480,9 @@ void HexagonPassConfig::addPreEmitPass() {
   // Packetization is mandatory: it handles gather/scatter at all opt levels.
   addPass(createHexagonPacketizer(NoOpt));
 
+  if (!NoOpt)
+    addPass(createHexagonLoopAlign());
+
   if (EnableVectorPrint)
     addPass(createHexagonVectorPrint());
 
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
index c5fed0cd65a814..34ff45b6acf345 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -23,6 +23,7 @@ namespace llvm {
 
 class HexagonTargetMachine : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  HexagonSubtarget Subtarget;
   mutable StringMap<std::unique_ptr<HexagonSubtarget>> SubtargetMap;
 
 public:
diff --git a/llvm/test/CodeGen/Hexagon/loop-balign.ll b/llvm/test/CodeGen/Hexagon/loop-balign.ll
new file mode 100644
index 00000000000000..9d1f42a4b14b18
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/loop-balign.ll
@@ -0,0 +1,91 @@
+; RUN: llc -march=hexagon -O3  < %s | FileCheck %s -check-prefix=BALIGN
+; BALIGN: .p2align{{.*}}5
+
+; The test for checking the alignment of 'for.body4.for.body4_crit_edge' basic block
+
+define dso_local void @foo(i32 %nCol, i32 %nRow, ptr nocapture %resMat) local_unnamed_addr {
+entry:
+  %shl = shl i32 %nRow, 2
+  %cmp36 = icmp sgt i32 %nRow, 0
+  %0 = add i32 %nCol, -1
+  %.inv = icmp slt i32 %0, 1
+  %1 = select i1 %.inv, i32 1, i32 %nCol
+  br label %Outerloop
+
+Outerloop:                                        ; preds = %for.end7, %entry
+  %r12.0 = phi i32 [ 0, %entry ], [ %inc8, %for.end7 ]
+  %r7_6.0 = phi i64 [ undef, %entry ], [ %r7_6.1.lcssa, %for.end7 ]
+  %r0i.0 = phi i32 [ undef, %entry ], [ %r0i.1.lcssa, %for.end7 ]
+  %r5.0 = phi ptr [ %resMat, %entry ], [ %r5.1.lcssa, %for.end7 ]
+  %r8.0 = phi i32 [ %shl, %entry ], [ %r8.1.lcssa, %for.end7 ]
+  br i1 %cmp36, label %for.body.lr.ph, label %for.end7
+
+for.body.lr.ph:                                   ; preds = %Outerloop
+  %cmp332 = icmp eq i32 %r12.0, 0
+  %exitcond.peel = icmp eq i32 %r12.0, 1
+  br label %for.body
+
+for.body:                                         ; preds = %for.end, %for.body.lr.ph
+  %r8.141 = phi i32 [ %r8.0, %for.body.lr.ph ], [ %add, %for.end ]
+  %r5.140 = phi ptr [ %r5.0, %for.body.lr.ph ], [ %add.ptr, %for.end ]
+  %i.039 = phi i32 [ 0, %for.body.lr.ph ], [ %inc6, %for.end ]
+  %r0i.138 = phi i32 [ %r0i.0, %for.body.lr.ph ], [ %4, %for.end ]
+  %r7_6.137 = phi i64 [ %r7_6.0, %for.body.lr.ph ], [ %r7_6.2.lcssa, %for.end ]
+  %add = add nsw i32 %r8.141, %shl
+  br i1 %cmp332, label %for.end, label %for.body4.peel
+
+for.body4.peel:                                   ; preds = %for.body
+  %r1i.0.in.peel = inttoptr i32 %r8.141 to ptr
+  %r1i.0.peel = load i32, ptr %r1i.0.in.peel, align 4
+  %2 = tail call i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64 %r7_6.137, i32 %r1i.0.peel, i32 %r0i.138)
+  br i1 %exitcond.peel, label %for.end, label %for.body4.preheader.peel.newph
+
+for.body4.preheader.peel.newph:                   ; preds = %for.body4.peel
+  %r1i.0.in = inttoptr i32 %add to ptr
+  %r1i.0 = load i32, ptr %r1i.0.in, align 4
+  br label %for.body4
+
+for.body4:                                        ; preds = %for.body4.for.body4_crit_edge, %for.body4.preheader.peel.newph
+  %inc.phi = phi i32 [ %inc.0, %for.body4.for.body4_crit_edge ], [ 2, %for.body4.preheader.peel.newph ]
+  %r7_6.233 = phi i64 [ %3, %for.body4.for.body4_crit_edge ], [ %2, %for.body4.preheader.peel.newph ]
+  %3 = tail call i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64 %r7_6.233, i32 %r1i.0, i32 %r0i.138)
+  %exitcond = icmp eq i32 %inc.phi, %r12.0
+  br i1 %exitcond, label %for.end.loopexit, label %for.body4.for.body4_crit_edge
+
+for.body4.for.body4_crit_edge:                    ; preds = %for.body4
+  %inc.0 = add nuw nsw i32 %inc.phi, 1
+  br label %for.body4
+
+for.end.loopexit:                                 ; preds = %for.body4
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %for.body4.peel, %for.body
+  %r7_6.2.lcssa = phi i64 [ %r7_6.137, %for.body ], [ %2, %for.body4.peel ], [ %3, %for.end.loopexit ]
+  %4 = tail call i32 @llvm.hexagon.S2.clbp(i64 %r7_6.2.lcssa)
+  store i32 %4, ptr %r5.140, align 4
+  %add.ptr = getelementptr inbounds i8, ptr %r5.140, i32 undef
+  %inc6 = add nuw nsw i32 %i.039, 1
+  %exitcond47 = icmp eq i32 %inc6, %nRow
+  br i1 %exitcond47, label %for.end7.loopexit, label %for.body
+
+for.end7.loopexit:                                ; preds = %for.end
+  br label %for.end7
+
+for.end7:                                         ; preds = %for.end7.loopexit, %Outerloop
+  %r7_6.1.lcssa = phi i64 [ %r7_6.0, %Outerloop ], [ %r7_6.2.lcssa, %for.end7.loopexit ]
+  %r0i.1.lcssa = phi i32 [ %r0i.0, %Outerloop ], [ %4, %for.end7.loopexit ]
+  %r5.1.lcssa = phi ptr [ %r5.0, %Outerloop ], [ %add.ptr, %for.end7.loopexit ]
+  %r8.1.lcssa = phi i32 [ %r8.0, %Outerloop ], [ %add, %for.end7.loopexit ]
+  %inc8 = add nuw i32 %r12.0, 1
+  %exitcond48 = icmp eq i32 %inc8, %1
+  br i1 %exitcond48, label %if.end, label %Outerloop
+
+if.end:                                           ; preds = %for.end7
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64, i32, i32) 
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.hexagon.S2.clbp(i64)
diff --git a/llvm/test/CodeGen/Hexagon/loop_align_count.ll b/llvm/test/CodeGen/Hexagon/loop_align_count.ll
new file mode 100644
index 00000000000000..07d7e4a8d61176
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/loop_align_count.ll
@@ -0,0 +1,115 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv73 -O2 -mattr=+hvxv73,hvx-length64b \
+; RUN: -debug-only=hexagon-loop-align 2>&1 < %s | FileCheck %s
+; Validate that there are 4 bundles in the loop.
+
+; CHECK: Loop Align Pass:
+; CHECK: Bundle Count : 4
+; CHECK: .p2align{{.*}}5
+
+; Function Attrs: nounwind
+define void @ham(ptr noalias nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3, ptr noalias nocapture %arg4, i32 %arg5) #0 {
+bb:
+  %ashr = ashr i32 %arg3, 2
+  %ashr6 = ashr i32 %arg3, 1
+  %add = add nsw i32 %ashr6, %ashr
+  %icmp = icmp sgt i32 %arg2, 0
+  br i1 %icmp, label %bb7, label %bb61
+
+bb7:                                              ; preds = %bb
+  %sdiv = sdiv i32 %arg1, 64
+  %icmp8 = icmp sgt i32 %arg1, 63
+  br label %bb9
+
+bb9:                                              ; preds = %bb57, %bb7
+  %phi = phi i32 [ 0, %bb7 ], [ %add58, %bb57 ]
+  %ashr10 = ashr exact i32 %phi, 1
+  %mul = mul nsw i32 %ashr10, %arg3
+  br i1 %icmp8, label %bb11, label %bb57
+
+bb11:                                             ; preds = %bb9
+  %add12 = add nsw i32 %phi, 1
+  %mul13 = mul nsw i32 %add12, %arg5
+  %mul14 = mul nsw i32 %phi, %arg5
+  %add15 = add i32 %add, %mul
+  %add16 = add i32 %mul, %ashr
+  %add17 = add i32 %mul, %ashr6
+  %getelementptr = getelementptr inbounds i8, ptr %arg4, i32 %mul13
+  %getelementptr18 = getelementptr inbounds i8, ptr %arg4, i32 %mul14
+  %getelementptr19 = getelementptr inbounds i16, ptr %arg, i32 %add15
+  %getelementptr20 = getelementptr inbounds i16, ptr %arg, i32 %add16
+  %getelementptr21 = getelementptr inbounds i16, ptr %arg, i32 %add17
+  %getelementptr22 = getelementptr inbounds i16, ptr %arg, i32 %mul
+  %bitcast = bitcast ptr %getelementptr to ptr
+  %bitcast23 = bitcast ptr %getelementptr18 to ptr
+  %bitcast24 = bitcast ptr %getelementptr19 to ptr
+  %bitcast25 = bitcast ptr %getelementptr20 to ptr
+  %bitcast26 = bitcast ptr %getelementptr21 to ptr
+  %bitcast27 = bitcast ptr %getelementptr22 to ptr
+  br label %bb28
+
+bb28:                                             ; preds = %bb28, %bb11
+  %phi29 = phi i32 [ 0, %bb11 ], [ %add54, %bb28 ]
+  %phi30 = phi ptr [ %bitcast27, %bb11 ], [ %getelementptr36, %bb28 ]
+  %phi31 = phi ptr [ %bitcast26, %bb11 ], [ %getelementptr37, %bb28 ]
+  %phi32 = phi ptr [ %bitcast25, %bb11 ], [ %getelementptr39, %bb28 ]
+  %phi33 = phi ptr [ %bitcast24, %bb11 ], [ %getelementptr41, %bb28 ]
+  %phi34 = phi ptr [ %bitcast, %bb11 ], [ %getelementptr53, %bb28 ]
+  %phi35 = phi ptr [ %bitcast23, %bb11 ], [ %getelementptr52, %bb28 ]
+  %getelementptr36 = getelementptr inbounds <16 x i32>, ptr %phi30, i32 1
+  %load = load <16 x i32>, ptr %phi30, align 64
+  %getelementptr37 = getelementptr inbounds <16 x i32>, ptr %phi31, i32 1
+  %load38 = load <16 x i32>, ptr %phi31, align 64
+  %getelementptr39 = getelementptr inbounds <16 x i32>, ptr %phi32, i32 1
+  %load40 = load <16 x i32>, ptr %phi32, align 64
+  %getelementptr41 = getelementptr inbounds <16 x i32>, ptr %phi33, i32 1
+  %load42 = load <16 x i32>, ptr %phi33, align 64
+  %call = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load, <16 x i32> %load38)
+  %call43 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load, <16 x i32> %load38)
+  %call44 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load40, <16 x i32> %load42)
+  %call45 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load40, <16 x i32> %load42)
+  %call46 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call, <16 x i32> %call44)
+  %call47 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call, <16 x i32> %call44)
+  %call48 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call43, <16 x i32> %call45)
+  %call49 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call43, <16 x i32> %call45)
+  %call50 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call47, <16 x i32> %call46)
+  %call51 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call49, <16 x i32> %call48)
+  %getelementptr52 = getelementptr inbounds <16 x i32>, ptr %phi35, i32 ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/83379