[llvm] [PowerPC][MachineLICM] Aggressively hoist CP loads with PC-Rel (PR #133313)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 27 13:58:02 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-powerpc
Author: Lei Huang (lei137)
<details>
<summary>Changes</summary>
With PC-Relative addressing, constant pool loads become prefixed. In addition to possible cache misses, prefixed loads have instruction dispatch effects that can cause significant performance degradations. As such, we want to be rather aggressive with LICM of such CP loads.
Patch by @<!-- -->nemanjai and @<!-- -->chenzheng1030
---
Full diff: https://github.com/llvm/llvm-project/pull/133313.diff
5 Files Affected:
- (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+4)
- (modified) llvm/lib/CodeGen/MachineLICM.cpp (+3-2)
- (modified) llvm/lib/Target/PowerPC/PPCISelLowering.cpp (+13)
- (modified) llvm/lib/Target/PowerPC/PPCISelLowering.h (+2)
- (added) llvm/test/CodeGen/PowerPC/hoist-cp-loads.ll (+98)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 58ac87206b9a6..f012366321f4d 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3450,6 +3450,10 @@ class TargetLoweringBase {
return isOperationLegalOrCustom(Op, VT);
}
+ /// Return true if invariant loads should be aggressively hoisted out of loops
+ /// even if the loop body might not execute.
+ virtual bool aggressivelyHoistInvariantLoads() const { return false; }
+
/// Should we expand [US]CMP nodes using two selects and two compares, or by
/// doing arithmetic on boolean types
virtual bool shouldExpandCmpUsingSelects(EVT VT) const { return false; }
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 1ac1a770ae72f..7321184bf067f 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1351,10 +1351,12 @@ bool MachineLICMImpl::IsProfitableToHoist(MachineInstr &MI,
return false;
}
+ bool InvariantLoad = MI.isDereferenceableInvariantLoad();
// Do not "speculate" in high register pressure situation. If an
// instruction is not guaranteed to be executed in the loop, it's best to be
// conservative.
if (AvoidSpeculation &&
+ (!InvariantLoad || !TLI->aggressivelyHoistInvariantLoads()) &&
(!IsGuaranteedToExecute(MI.getParent(), CurLoop) && !MayCSE(&MI))) {
LLVM_DEBUG(dbgs() << "Won't speculate: " << MI);
return false;
@@ -1393,8 +1395,7 @@ bool MachineLICMImpl::IsProfitableToHoist(MachineInstr &MI,
// High register pressure situation, only hoist if the instruction is going
// to be remat'ed.
- if (!isTriviallyReMaterializable(MI) &&
- !MI.isDereferenceableInvariantLoad()) {
+ if (!isTriviallyReMaterializable(MI) && !InvariantLoad) {
LLVM_DEBUG(dbgs() << "Can't remat / high reg-pressure: " << MI);
return false;
}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index ab78f33f5a630..8bd8ce7e4fb92 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -125,6 +125,11 @@ cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
cl::desc("use absolute jump tables on ppc"), cl::Hidden);
+static cl::opt<bool> DisableAggressiveCPLHoist(
+ "disable-aggressive-cpl-hoist",
+ cl::desc("aggressively hoist constant pool loads out of loops"),
+ cl::Hidden);
+
static cl::opt<bool>
DisablePerfectShuffle("ppc-disable-perfect-shuffle",
cl::desc("disable vector permute decomposition"),
@@ -18677,6 +18682,14 @@ isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
return true;
}
+// Constant pool loads in loops cause a major performance degradation with
+// prefixed pc-relative loads so move them out aggressively.
+// FIXME: This may be useful with TOC-based CP loads under some conditions
+// but initial performance testing did not produce conclusive evidence.
+bool PPCTargetLowering::aggressivelyHoistInvariantLoads() const {
+ return Subtarget.isUsingPCRelativeCalls() && !DisableAggressiveCPLHoist;
+}
+
/// getAddrModeForFlags - Based on the set of address flags, select the most
/// optimal instruction format to match by.
PPC::AddrMode PPCTargetLowering::getAddrModeForFlags(unsigned Flags) const {
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 1f22aa16a89be..bc36085f564fa 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1111,6 +1111,8 @@ namespace llvm {
// Keep the zero-extensions for arguments to libcalls.
bool shouldKeepZExtForFP16Conv() const override { return true; }
+ bool aggressivelyHoistInvariantLoads() const override;
+
/// createFastISel - This method returns a target-specific FastISel object,
/// or null if the target does not support "fast" instruction selection.
FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
diff --git a/llvm/test/CodeGen/PowerPC/hoist-cp-loads.ll b/llvm/test/CodeGen/PowerPC/hoist-cp-loads.ll
new file mode 100644
index 0000000000000..325d3ec2219e7
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/hoist-cp-loads.ll
@@ -0,0 +1,98 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O3 \
+; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN: -stop-before=greedy < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O3 \
+; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN: -disable-aggressive-cpl-hoist \
+; RUN: -stop-before=greedy < %s | FileCheck %s --check-prefix=NOHOIST
+
+;; This test checks that when register pressure is high, -disable-aggressive-cpl-hoist will control
+;; the hoist of the constant loads.
+;; There are 34 PLXVpc LICM candicates, the first 32 PLXVpc will always be hoisted
+;; because the register pressure is low. But the left 2 PLXVpc will be controled
+;; by -disable-aggressive-cpl-hoist option. By default, these 2 PLXVpc will be hoisted too.
+;; With -disable-aggressive-cpl-hoist, these 2 PLXVpc will be kept inside the loop.
+
+; CHECK: name: test
+; CHECK: for.body.preheader:
+; CHECK: MTCTR8loop
+; CHECK-COUNT: PLXVpc 34
+; CHECK: B %bb.
+; CHECK: for.body:
+; CHECK-NO: PLXVpc
+;
+; NOHOIST: name: test
+; NOHOIST: for.body.preheader:
+; NOHOIST: MTCTR8loop
+; NOHOIST-COUNT: PLXVpc 32
+; NOHOIST: B %bb.
+; NOHOIST: for.body:
+; NOHOIST-COUNT: PLXVpc 2
+
+define void @test(ptr %array, ptr readonly %indicate, i32 signext %count) nounwind {
+entry:
+ %cmp24 = icmp sgt i32 %count, 0
+ br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext nneg i32 %count to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %arrayidx4 = getelementptr inbounds i8, ptr %indicate, i64 %indvars.iv
+ %ind = load i8, ptr %arrayidx4, align 1
+ %cmp1 = icmp ugt i8 %ind, 10
+ %arrayidx = getelementptr inbounds i8, ptr %array, i64 %indvars.iv
+ %0 = load <16 x i16>, ptr %arrayidx, align 1
+ br i1 %cmp1, label %if.then, label %if.else
+
+if.then: ; preds = %for.body
+ %add = add <16 x i16> %0, <i16 1, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 1>
+ store volatile <16 x i16> %add, ptr %arrayidx, align 32
+ %add2 = add <16 x i16> %0, <i16 2, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 2>
+ store volatile <16 x i16> %add2, ptr %arrayidx, align 32
+ %add3 = add <16 x i16> %0, <i16 3, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 3>
+ store volatile <16 x i16> %add3, ptr %arrayidx, align 32
+ %add4 = add <16 x i16> %0, <i16 4, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 4>
+ store volatile <16 x i16> %add4, ptr %arrayidx, align 32
+ %add5 = add <16 x i16> %0, <i16 5, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 5>
+ store volatile <16 x i16> %add5, ptr %arrayidx, align 32
+ %add6 = add <16 x i16> %0, <i16 6, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 6>
+ store volatile <16 x i16> %add6, ptr %arrayidx, align 32
+ %add7 = add <16 x i16> %0, <i16 7, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 7>
+ store volatile <16 x i16> %add7, ptr %arrayidx, align 32
+ %add8 = add <16 x i16> %0, <i16 8, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 8>
+ store volatile <16 x i16> %add8, ptr %arrayidx, align 32
+ %add9 = add <16 x i16> %0, <i16 9, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 9>
+ store volatile <16 x i16> %add9, ptr %arrayidx, align 32
+ br label %for.inc
+
+if.else: ; preds = %for.body
+ %add10 = add <16 x i16> %0, <i16 10, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 10>
+ store volatile <16 x i16> %add10, ptr %arrayidx, align 32
+ %add11 = add <16 x i16> %0, <i16 11, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 11>
+ store volatile <16 x i16> %add11, ptr %arrayidx, align 32
+ %add12 = add <16 x i16> %0, <i16 12, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 12>
+ store volatile <16 x i16> %add12, ptr %arrayidx, align 32
+ %add13 = add <16 x i16> %0, <i16 13, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 13>
+ store volatile <16 x i16> %add13, ptr %arrayidx, align 32
+ %add14 = add <16 x i16> %0, <i16 14, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 14>
+ store volatile <16 x i16> %add14, ptr %arrayidx, align 32
+ %add15 = add <16 x i16> %0, <i16 15, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 15>
+ store volatile <16 x i16> %add15, ptr %arrayidx, align 32
+ %add16 = add <16 x i16> %0, <i16 16, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16>
+ store volatile <16 x i16> %add16, ptr %arrayidx, align 32
+ %add17 = add <16 x i16> %0, <i16 17, i16 2, i16 3, i16 4, i16 5,i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 17>
+ store volatile <16 x i16> %add17, ptr %arrayidx, align 32
+
+ br label %for.inc
+
+for.inc: ; preds = %if.then, %if.else
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/133313
More information about the llvm-commits
mailing list