[llvm] [PowerPC][MachineLICM] Aggressively hoist CP loads with PC-Rel (PR #133313)

Lei Huang via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 27 13:57:23 PDT 2025


https://github.com/lei137 created https://github.com/llvm/llvm-project/pull/133313

With PC-Relative addressing, constant pool loads become prefixed. In addition to possible cache misses, prefixed loads have instruction dispatch effects that can cause significant performance degradations. As such, we want to be rather aggressive with LICM of such CP loads.

Patch by @nemanjai and @chenzheng1030 

>From ff5d215544816349f3771126a15c6bf0d4e2b9ca Mon Sep 17 00:00:00 2001
From: Lei Huang <lei at ca.ibm.com>
Date: Thu, 27 Mar 2025 16:51:28 -0400
Subject: [PATCH] [PowerPC][MachineLICM] Aggressively hoist CP loads with
 PC-Rel

With PC-Relative addressing, constant pool loads become prefixed.
In addition to possible cache misses, prefixed loads have instruction
dispatch effects that can cause significant performance degradations.
As such, we want to be rather aggressive with LICM of such CP loads.
---
 llvm/include/llvm/CodeGen/TargetLowering.h  |  4 +
 llvm/lib/CodeGen/MachineLICM.cpp            |  5 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 13 +++
 llvm/lib/Target/PowerPC/PPCISelLowering.h   |  2 +
 llvm/test/CodeGen/PowerPC/hoist-cp-loads.ll | 98 +++++++++++++++++++++
 5 files changed, 120 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/hoist-cp-loads.ll

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 58ac87206b9a6..f012366321f4d 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3450,6 +3450,10 @@ class TargetLoweringBase {
     return isOperationLegalOrCustom(Op, VT);
   }
 
+  /// Return true if invariant loads should be aggressively hoisted out of loops
+  /// even if the loop body might not execute.
+  virtual bool aggressivelyHoistInvariantLoads() const { return false; }
+
   /// Should we expand [US]CMP nodes using two selects and two compares, or by
   /// doing arithmetic on boolean types
   virtual bool shouldExpandCmpUsingSelects(EVT VT) const { return false; }
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 1ac1a770ae72f..7321184bf067f 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1351,10 +1351,12 @@ bool MachineLICMImpl::IsProfitableToHoist(MachineInstr &MI,
     return false;
   }
 
+  bool InvariantLoad = MI.isDereferenceableInvariantLoad();
   // Do not "speculate" in high register pressure situation. If an
   // instruction is not guaranteed to be executed in the loop, it's best to be
   // conservative.
   if (AvoidSpeculation &&
+      (!InvariantLoad || !TLI->aggressivelyHoistInvariantLoads()) &&
       (!IsGuaranteedToExecute(MI.getParent(), CurLoop) && !MayCSE(&MI))) {
     LLVM_DEBUG(dbgs() << "Won't speculate: " << MI);
     return false;
@@ -1393,8 +1395,7 @@ bool MachineLICMImpl::IsProfitableToHoist(MachineInstr &MI,
 
   // High register pressure situation, only hoist if the instruction is going
   // to be remat'ed.
-  if (!isTriviallyReMaterializable(MI) &&
-      !MI.isDereferenceableInvariantLoad()) {
+  if (!isTriviallyReMaterializable(MI) && !InvariantLoad) {
     LLVM_DEBUG(dbgs() << "Can't remat / high reg-pressure: " << MI);
     return false;
   }
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index ab78f33f5a630..8bd8ce7e4fb92 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -125,6 +125,11 @@ cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
 static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
 cl::desc("use absolute jump tables on ppc"), cl::Hidden);
 
+static cl::opt<bool> DisableAggressiveCPLHoist(
+    "disable-aggressive-cpl-hoist",
+    cl::desc("disable aggressive hoisting of constant pool loads out of loops"),
+    cl::Hidden);
+
 static cl::opt<bool>
     DisablePerfectShuffle("ppc-disable-perfect-shuffle",
                           cl::desc("disable vector permute decomposition"),
@@ -18677,6 +18682,14 @@ isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
   return true;
 }
 
+// Constant pool loads in loops cause a major performance degradation with
+// prefixed pc-relative loads so move them out aggressively.
+// FIXME: This may be useful with TOC-based CP loads under some conditions
+// but initial performance testing did not produce conclusive evidence.
+bool PPCTargetLowering::aggressivelyHoistInvariantLoads() const {
+  return Subtarget.isUsingPCRelativeCalls() && !DisableAggressiveCPLHoist;
+}
+
 /// getAddrModeForFlags - Based on the set of address flags, select the most
 /// optimal instruction format to match by.
 PPC::AddrMode PPCTargetLowering::getAddrModeForFlags(unsigned Flags) const {
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 1f22aa16a89be..bc36085f564fa 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1111,6 +1111,8 @@ namespace llvm {
     // Keep the zero-extensions for arguments to libcalls.
     bool shouldKeepZExtForFP16Conv() const override { return true; }
 
+    bool aggressivelyHoistInvariantLoads() const override;
+
     /// createFastISel - This method returns a target-specific FastISel object,
     /// or null if the target does not support "fast" instruction selection.
     FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
diff --git a/llvm/test/CodeGen/PowerPC/hoist-cp-loads.ll b/llvm/test/CodeGen/PowerPC/hoist-cp-loads.ll
new file mode 100644
index 0000000000000..325d3ec2219e7
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/hoist-cp-loads.ll
@@ -0,0 +1,98 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O3 \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -stop-before=greedy < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O3 \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -disable-aggressive-cpl-hoist \
+; RUN:   -stop-before=greedy < %s | FileCheck %s --check-prefix=NOHOIST
+
+;; This test checks that when register pressure is high, -disable-aggressive-cpl-hoist will control
+;; the hoisting of the constant loads.
+;; There are 34 PLXVpc LICM candidates; the first 32 PLXVpc will always be hoisted
+;; because the register pressure is low. The remaining 2 PLXVpc are controlled
+;; by the -disable-aggressive-cpl-hoist option. By default, these 2 PLXVpc are hoisted too.
+;; With -disable-aggressive-cpl-hoist, these 2 PLXVpc are kept inside the loop.
+
+; CHECK:   name: test
+; CHECK:         for.body.preheader:
+; CHECK:           MTCTR8loop
+; CHECK-COUNT-34:  PLXVpc
+; CHECK:           B %bb.
+; CHECK:         for.body:
+; CHECK-NOT:       PLXVpc
+;
+; NOHOIST:   name: test
+; NOHOIST:         for.body.preheader:
+; NOHOIST:           MTCTR8loop
+; NOHOIST-COUNT-32:  PLXVpc
+; NOHOIST:           B %bb.
+; NOHOIST:         for.body:
+; NOHOIST-COUNT-2:   PLXVpc
+
+define void @test(ptr %array, ptr readonly %indicate, i32 signext %count) nounwind {
+entry:
+  %cmp24 = icmp sgt i32 %count, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext nneg i32 %count to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx4 = getelementptr inbounds i8, ptr %indicate, i64 %indvars.iv
+  %ind = load i8, ptr %arrayidx4, align 1
+  %cmp1 = icmp ugt i8 %ind, 10
+  %arrayidx = getelementptr inbounds i8, ptr %array, i64 %indvars.iv
+  %0 = load <16 x i16>, ptr %arrayidx, align 1
+  br i1 %cmp1, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body
+  %add = add <16 x i16> %0, <i16 1, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 1>
+  store volatile <16 x i16> %add, ptr %arrayidx, align 32
+  %add2 = add <16 x i16> %0, <i16 2, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 2>
+  store volatile <16 x i16> %add2, ptr %arrayidx, align 32
+  %add3 = add <16 x i16> %0, <i16 3, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 3>
+  store volatile <16 x i16> %add3, ptr %arrayidx, align 32
+  %add4 = add <16 x i16> %0, <i16 4, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 4>
+  store volatile <16 x i16> %add4, ptr %arrayidx, align 32
+  %add5 = add <16 x i16> %0, <i16 5, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 5>
+  store volatile <16 x i16> %add5, ptr %arrayidx, align 32
+  %add6 = add <16 x i16> %0, <i16 6, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 6>
+  store volatile <16 x i16> %add6, ptr %arrayidx, align 32
+  %add7 = add <16 x i16> %0, <i16 7, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 7>
+  store volatile <16 x i16> %add7, ptr %arrayidx, align 32
+  %add8 = add <16 x i16> %0, <i16 8, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 8>
+  store volatile <16 x i16> %add8, ptr %arrayidx, align 32
+  %add9 = add <16 x i16> %0, <i16 9, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 9>
+  store volatile <16 x i16> %add9, ptr %arrayidx, align 32
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %add10 = add <16 x i16> %0, <i16 10, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 10>
+  store volatile <16 x i16> %add10, ptr %arrayidx, align 32
+  %add11 = add <16 x i16> %0, <i16 11, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 11>
+  store volatile <16 x i16> %add11, ptr %arrayidx, align 32
+  %add12 = add <16 x i16> %0, <i16 12, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 12>
+  store volatile <16 x i16> %add12, ptr %arrayidx, align 32
+  %add13 = add <16 x i16> %0, <i16 13, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 13>
+  store volatile <16 x i16> %add13, ptr %arrayidx, align 32
+  %add14 = add <16 x i16> %0, <i16 14, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 14>
+  store volatile <16 x i16> %add14, ptr %arrayidx, align 32
+  %add15 = add <16 x i16> %0, <i16 15, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 15>
+  store volatile <16 x i16> %add15, ptr %arrayidx, align 32
+  %add16 = add <16 x i16> %0, <i16 16, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16>
+  store volatile <16 x i16> %add16, ptr %arrayidx, align 32
+  %add17 = add <16 x i16> %0, <i16 17, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 17>
+  store volatile <16 x i16> %add17, ptr %arrayidx, align 32
+
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.else
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}



More information about the llvm-commits mailing list