[llvm] [MachineCSE] Enhance MachineCSE simple PRE to find common subexpression completely (PR #129860)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 13 07:56:06 PDT 2025
https://github.com/emelliu updated https://github.com/llvm/llvm-project/pull/129860
From 7b8770fe19a07630dc8c012be082bde6838a4f3a Mon Sep 17 00:00:00 2001
From: "xiangyou.liu" <xiangyou.liu at iluvatar.com>
Date: Wed, 26 Feb 2025 14:25:44 +0800
Subject: [PATCH] [MachineCSE] Enhance MachineCSE simple PRE to find common
subexpression completely (#129516).
When an if-then-else construct sits inside a loop, the then and else blocks may share a
multi-instruction common subexpression, but the current PRE step can only find and hoist
one MI at a time. This can create many temporary registers whose live ranges stretch from
CMBB into the then/else blocks, increasing register pressure.
This patch teaches the PRE step to find all MIs of a common subexpression and hoist them
into CMBB together.
Signed-off-by: xiangyou.liu <xiangyou.liu at iluvatar.com>
---
llvm/lib/CodeGen/MachineCSE.cpp | 44 ++++++
.../CodeGen/AMDGPU/not-cse-completely.mir | 141 ++++++++++++++++++
2 files changed, 185 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/not-cse-completely.mir
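As a source-level illustration (a hypothetical example, not taken from the patch or the
test below), the shape being targeted looks like this: both sides of an if/else inside a
loop recompute the same multi-instruction chain, and PRE should hoist the whole chain
into the common dominator rather than just its first instruction.

    // Hypothetical C++ shape of the problem; names and constants are invented,
    // loosely mirroring the shift/multiply/subtract chain in the MIR test below.
    int f(long v, int s, int a, int b, int n) {
      int acc = 0;
      for (int i = 0; i < n; ++i) {
        if (i & 1) {
          long t = v >> s;                      // first common instruction
          int w = (int)(t >> 32) - 4 * (int)t;  // second and third
          acc += w > a ? w : a;                 // then-only use
        } else {
          long t = v >> s;                      // same chain duplicated
          int w = (int)(t >> 32) - 4 * (int)t;
          acc += w < b ? w : b;                 // else-only use
        }
      }
      return acc;
    }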
diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp
index 6d14509c5934f..0566f1bc94341 100644
--- a/llvm/lib/CodeGen/MachineCSE.cpp
+++ b/llvm/lib/CodeGen/MachineCSE.cpp
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -97,6 +98,7 @@ class MachineCSEImpl {
DenseMap<MachineBasicBlock *, ScopeType *> ScopeMap;
DenseMap<MachineInstr *, MachineBasicBlock *, MachineInstrExpressionTrait>
PREMap;
+ DenseMap<MachineInstr *, std::pair<Register, Register>> InstRegChangedMap;
ScopedHTType VNT;
SmallVector<MachineInstr *, 64> Exps;
unsigned CurrVN = 0;
@@ -125,6 +127,7 @@ class MachineCSEImpl {
bool isPRECandidate(MachineInstr *MI, SmallSet<MCRegister, 8> &PhysRefs);
bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB);
bool PerformSimplePRE(MachineDominatorTree *DT);
+ void RecoverRegChangedInst();
/// Heuristics to see if it's profitable to move common computations of MBB
/// and MBB1 to CandidateBB.
bool isProfitableToHoistInto(MachineBasicBlock *CandidateBB,
@@ -887,6 +890,26 @@ bool MachineCSEImpl::ProcessBlockPRE(MachineDominatorTree *DT,
NewMI.setDebugLoc(EmptyDL);
NewMI.getOperand(0).setReg(NewReg);
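+      // Rewrite the uses of VReg within this block to the hoisted NewReg,
+      // recording each change in InstRegChangedMap so it can be restored
+      // before the CSE step runs.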
+ for (MachineOperand &MO :
+ llvm::make_early_inc_range(MRI->use_nodbg_operands(VReg))) {
+        if (MO.isUse() &&
+            MO.getParent()->getParent() == MBB) {
+ InstRegChangedMap[MO.getParent()] =
+ std::make_pair(MO.getReg(), NewReg);
+ MO.setReg(NewReg);
+ }
+ }
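+      // Also rewrite uses of the sibling copy's def (the identical MI recorded
+      // earlier in PREMap) to NewReg, for users that are themselves PRE
+      // candidates, so both copies of the chain keep matching in later lookups.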
+ auto *SiblingBBMI = PREMap.try_emplace(&MI).first->getFirst();
+ for (MachineOperand &MO :
+ llvm::make_early_inc_range(MRI->use_nodbg_operands(
+ SiblingBBMI->getOperand(0).getReg()))) {
+ if (MO.isUse() && PREMap.count(MO.getParent())) {
+ MachineInstr *UseMI = MO.getParent();
+ InstRegChangedMap[UseMI] = std::make_pair(MO.getReg(), NewReg);
+ MO.setReg(NewReg);
+ PREMap[UseMI] = UseMI->getParent();
+ }
+ }
PREMap[&MI] = CMBB;
++NumPREs;
@@ -897,6 +920,23 @@ bool MachineCSEImpl::ProcessBlockPRE(MachineDominatorTree *DT,
return Changed;
}
+// To find the complete subexpression, PRE may rewrite operands of the original
+// MIR, which carries some risk if the redundant MIs cannot be eliminated later.
+// So when leaving PRE, restore the rewritten operands to their original state,
+// leaving only the hoisted common subexpression MIs in CMBB.
+void MachineCSEImpl::RecoverRegChangedInst() {
+ for (auto Item : InstRegChangedMap) {
+ MachineInstr *Inst = Item.getFirst();
+ Register OriginReg = Item.getSecond().first;
+ Register NewReg = Item.getSecond().second;
+ for (MachineOperand &MO : Inst->operands()) {
+ if (MO.isReg() && MO.isUse() && MO.getReg() == NewReg) {
+ MO.setReg(OriginReg);
+ }
+ }
+ }
+}
+
// This simple PRE (partial redundancy elimination) pass doesn't actually
// eliminate partial redundancy but transforms it to full redundancy,
// anticipating that the next CSE step will eliminate this created redundancy.
@@ -906,6 +946,8 @@ bool MachineCSEImpl::PerformSimplePRE(MachineDominatorTree *DT) {
SmallVector<MachineDomTreeNode *, 32> BBs;
PREMap.clear();
+ InstRegChangedMap.clear();
+
bool Changed = false;
BBs.push_back(DT->getRootNode());
do {
@@ -917,6 +959,8 @@ bool MachineCSEImpl::PerformSimplePRE(MachineDominatorTree *DT) {
} while (!BBs.empty());
+ RecoverRegChangedInst();
+
return Changed;
}
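The bookkeeping above follows a record-and-restore pattern: operands are rewritten
speculatively so that the remaining MIs of a chain hash identically, every rewrite is
remembered in InstRegChangedMap, and RecoverRegChangedInst undoes them all before the
CSE step runs. A minimal self-contained sketch of that pattern (plain C++, not the
LLVM API; the types and names here are invented for illustration):

    #include <cstdint>
    #include <map>
    #include <utility>
    #include <vector>

    struct Inst {
      std::vector<uint32_t> Uses; // registers read by this instruction
    };

    // Analogue of InstRegChangedMap: instruction -> (original reg, new reg).
    std::map<Inst *, std::pair<uint32_t, uint32_t>> RegChanged;

    // Speculatively rewrite a use and remember how to undo it.
    void rewriteUse(Inst &I, uint32_t OldReg, uint32_t NewReg) {
      for (uint32_t &R : I.Uses)
        if (R == OldReg) {
          RegChanged[&I] = {OldReg, NewReg};
          R = NewReg;
        }
    }

    // Analogue of RecoverRegChangedInst(): restore every recorded operand.
    void recover() {
      for (auto &[I, Regs] : RegChanged)
        for (uint32_t &R : I->Uses)
          if (R == Regs.second)
            R = Regs.first;
      RegChanged.clear();
    }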
diff --git a/llvm/test/CodeGen/AMDGPU/not-cse-completely.mir b/llvm/test/CodeGen/AMDGPU/not-cse-completely.mir
new file mode 100644
index 0000000000000..8b1b11c55fa18
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/not-cse-completely.mir
@@ -0,0 +1,141 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -run-pass=machine-cse -o - %s | FileCheck %s
+
+# Without this enhancement, the MachineCSE pass cannot hoist the common subexpression in this test
+# completely; that produces extra temporaries that are live from CMBB into then-bb and else-bb and
+# increases register pressure. Enhancing SimplePRE to hoist the subexpression as a whole lets the
+# CSE step eliminate the redundant MIs and solves this problem.
+
+# This test case has a while-loop containing one if-else; the while-loop exists so that
+# then-bb and else-bb are potentially reachable from each other.
+# then-bb and else-bb share the common subexpression below; %8 is its final value, built
+# from the preceding MIs of the chain.
+#
+# %6:vreg_64 = V_ASHR_I64_e64 %0, %1, implicit $exec
+# %7:vgpr_32 = V_MUL_I32_I24_e32 4, %6.sub0, implicit $exec
+# %8:vgpr_32 = V_SUB_I32_e64 %6.sub1, %7, 0, implicit $exec
+#
+# Before this patch, MachineCSE only hoists the first and second MIs, and the hoisted V_MUL_I32_I24_e32
+# may also be followed by other MIs in CMBB, so the live ranges of %6 and %7 from CMBB into then-bb or
+# else-bb become longer, which increases register pressure.
+
+
+--- |
+ define amdgpu_kernel void @not_cse_completely(ptr addrspace(1), i32 %a, i1) {
+ entry:
+ br label %while.cond
+ while.cond:
+ %cmp = trunc i32 %a to i1
+ br i1 %cmp, label %if.then, label %if.else
+ if.then:
+ br label %if.end
+ if.else:
+ br label %if.end
+ if.end:
+ %cmp2 = trunc i32 %a to i1
+ br i1 %cmp, label %while.cond, label %while.end
+ while.end:
+ ret void
+ }
+...
+---
+name: not_cse_completely
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: not_cse_completely
+ ; CHECK: bb.0.entry:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1, $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1.while.cond:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 42
+ ; CHECK-NEXT: S_CMP_EQ_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[V_ASHR_I64_e64_:%[0-9]+]]:vreg_64 = V_ASHR_I64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_I32_I24_e32_:%[0-9]+]]:vgpr_32 = V_MUL_I32_I24_e32 4, [[V_ASHR_I64_e64_]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_SUB_I32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 [[V_ASHR_I64_e64_]].sub1, [[V_MUL_I32_I24_e32_]], 0, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.if.then:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_MAX_I32_e32_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e32 [[V_SUB_I32_e64_]], [[COPY2]], implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3.if.else:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_MIN_I32_e32_:%[0-9]+]]:vgpr_32 = V_MIN_I32_e32 [[V_SUB_I32_e64_]], [[COPY3]], implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4.if.end:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MAX_I32_e32_]], %bb.2, [[V_MIN_I32_e32_]], %bb.3
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[COPY]]
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY5]], [[PHI]], 0, 0, implicit $exec, implicit $flat_scr
+ ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[PHI]], 2, implicit $exec
+ ; CHECK-NEXT: S_CMP_EQ_U32 [[V_CMP_EQ_U32_e64_]].sub0, 10, implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.5.while.end:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ successors: %bb.1(0x80000000)
+ liveins: $sgpr0_sgpr1, $vgpr0, $vgpr1, $vgpr2
+
+ %0:sgpr_64 = COPY $sgpr0_sgpr1
+ %1:vgpr_32 = COPY $vgpr0
+ %2:vgpr_32 = COPY $vgpr1
+ %3:vgpr_32 = COPY $vgpr2
+ S_BRANCH %bb.1
+
+ bb.1.while.cond:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+
+ %4:sreg_32 = COPY %0.sub1
+ %5:sreg_32 = S_MOV_B32 42
+ S_CMP_EQ_U32 %4, %5, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.3, implicit $scc
+ S_BRANCH %bb.2
+
+ bb.2.if.then:
+ successors: %bb.4(0x80000000)
+
+ %6:vreg_64 = V_ASHR_I64_e64 %0, %1, implicit $exec
+ %7:vgpr_32 = V_MUL_I32_I24_e32 4, %6.sub0, implicit $exec
+ %8:vgpr_32 = V_SUB_I32_e64 %6.sub1, %7, 0, implicit $exec
+ %9:vgpr_32 = V_MAX_I32_e32 %8, %2, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.3.if.else:
+ successors: %bb.4(0x80000000)
+
+ %10:vreg_64 = V_ASHR_I64_e64 %0, %1, implicit $exec
+ %11:vgpr_32 = V_MUL_I32_I24_e32 4, %10.sub0, implicit $exec
+ %12:vgpr_32 = V_SUB_I32_e64 %10.sub1, %11, 0, implicit $exec
+ %13:vgpr_32 = V_MIN_I32_e32 %12, %3, implicit $exec
+
+ bb.4.if.end:
+ successors: %bb.1(0x40000000), %bb.5(0x40000000)
+
+ %14:vgpr_32 = PHI %9, %bb.2, %13, %bb.3
+ %15:vreg_64 = COPY %0
+ FLAT_STORE_DWORD %15, %14, 0, 0, implicit $exec, implicit $flat_scr
+ %16:sreg_64 = V_CMP_EQ_U32_e64 %14, 2, implicit $exec
+ S_CMP_EQ_U32 %16.sub0, 10, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit $scc
+ S_BRANCH %bb.5
+
+ bb.5.while.end:
+ S_ENDPGM 0
+
+...