[llvm] [MachineCSE] Enhance MachineCSE simple PRE to find common subexpressi… (PR #129860)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 6 23:53:48 PST 2025
https://github.com/emelliu updated https://github.com/llvm/llvm-project/pull/129860
>From 45bcb9f49d31d9893c33eea592757e32e2b398fd Mon Sep 17 00:00:00 2001
From: "xiangyou.liu" <xiangyou.liu at iluvatar.com>
Date: Wed, 26 Feb 2025 14:25:44 +0800
Subject: [PATCH] [MachineCSE] Enhance MachineCSE simple PRE to find common
subexpression completely (#129516).
When an IF-THEN-ELSE construct is inside a loop, the THEN and ELSE blocks may share a common
subexpression consisting of multiple MIs, but the current PRE process can only find and hoist one
MI at a time. This may create many temporary registers whose live ranges extend from the CMBB to
the THEN/ELSE blocks, which can increase register pressure.
This patch tries to find all MIs of the common subexpression and hoist them to the CMBB in the PRE process.
Signed-off-by: xiangyou.liu <xiangyou.liu at iluvatar.com>
---
llvm/lib/CodeGen/MachineCSE.cpp | 17 +++
.../CodeGen/AMDGPU/not-cse-completely.mir | 141 ++++++++++++++++++
2 files changed, 158 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/not-cse-completely.mir
diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp
index 6d14509c5934f..9cdbcbafb6201 100644
--- a/llvm/lib/CodeGen/MachineCSE.cpp
+++ b/llvm/lib/CodeGen/MachineCSE.cpp
@@ -887,6 +887,23 @@ bool MachineCSEImpl::ProcessBlockPRE(MachineDominatorTree *DT,
NewMI.setDebugLoc(EmptyDL);
NewMI.getOperand(0).setReg(NewReg);
+ for (MachineOperand &MO :
+ llvm::make_early_inc_range(MRI->use_nodbg_operands(VReg))) {
+ if (MO.isUse()) {
+ MO.setReg(NewReg);
+ }
+ }
+ auto *SiblingBBMI = PREMap.try_emplace(&MI).first->getFirst();
+ for (MachineOperand &MO :
+ llvm::make_early_inc_range(MRI->use_nodbg_operands(
+ SiblingBBMI->getOperand(0).getReg()))) {
+ if (MO.isUse()) {
+ MachineInstr *UseMI = MO.getParent();
+ PREMap.erase(UseMI);
+ MO.setReg(NewReg);
+ PREMap[UseMI] = UseMI->getParent();
+ }
+ }
PREMap[&MI] = CMBB;
++NumPREs;
diff --git a/llvm/test/CodeGen/AMDGPU/not-cse-completely.mir b/llvm/test/CodeGen/AMDGPU/not-cse-completely.mir
new file mode 100644
index 0000000000000..85f7afdc0ce7e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/not-cse-completely.mir
@@ -0,0 +1,141 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -run-pass=machine-cse -o - %s | FileCheck %s
+
+# The MachineCSE pass cannot hoist the common subexpression completely in this test case; this may
+# produce more temporary variables live from the CMBB to the then-BB and else-BB, increasing register pressure.
+# If we enhance SimplePRE to hoist the subexpression completely and eliminate the redundant MIs in the
+# CSE process, this problem is solved.
+
+# This test case has a while-loop containing one if-else; the while-loop exists so that
+# the then-bb and else-bb are potentially reachable from each other.
+# then-bb and else-bb have common subexpression, %8 are from predecessor.
+#
+# %6:vreg_64 = V_ASHR_I64_e64 %0, %1, implicit $exec
+# %7:vgpr_32 = V_MUL_I32_I24_e32 4, %6.sub0, implicit $exec
+# %8:vgpr_32 = V_SUB_I32_e64 %6.sub1, %7, 0, implicit $exec
+#
+# Before this patch, MachineCSE hoists just the first and second MIs, and the hoisted V_MUL_I32_I24_e32
+# may also be followed by other MIs in the CMBB, so the live ranges of registers %12 and %13 from the CMBB
+# to the then-bb or else-bb would be longer, which increases register pressure.
+
+
+--- |
+ define amdgpu_kernel void @not_cse_completely(ptr addrspace(1), i32 %a, i1) {
+ entry:
+ br label %while.cond
+ while.cond:
+ %cmp = trunc i32 %a to i1
+ br i1 %cmp, label %if.then, label %if.else
+ if.then:
+ br label %if.end
+ if.else:
+ br label %if.end
+ if.end:
+ %cmp2 = trunc i32 %a to i1
+ br i1 %cmp, label %while.cond, label %while.end
+ while.end:
+ ret void
+ }
+...
+---
+name: not_cse_completely
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: not_cse_completely
+ ; CHECK: bb.0.entry:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1, $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1.while.cond:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 42
+ ; CHECK-NEXT: S_CMP_EQ_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[V_ASHR_I64_e64_:%[0-9]+]]:vreg_64 = V_ASHR_I64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_I32_I24_e32_:%[0-9]+]]:vgpr_32 = V_MUL_I32_I24_e32 4, [[V_ASHR_I64_e64_]].sub0, implicit $exec
+ ; CHECK-NEXT: [[V_SUB_I32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 [[V_ASHR_I64_e64_]].sub1, [[V_MUL_I32_I24_e32_]], 0, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.if.then:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_MAX_I32_e32_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e32 [[V_SUB_I32_e64_]], [[COPY2]], implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3.if.else:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_MIN_I32_e32_:%[0-9]+]]:vgpr_32 = V_MIN_I32_e32 [[V_SUB_I32_e64_]], [[COPY3]], implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4.if.end:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MAX_I32_e32_]], %bb.2, [[V_MIN_I32_e32_]], %bb.3
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[COPY]]
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY5]], [[PHI]], 0, 0, implicit $exec, implicit $flat_scr
+ ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[PHI]], 2, implicit $exec
+ ; CHECK-NEXT: S_CMP_EQ_U32 [[V_CMP_EQ_U32_e64_]].sub0, 10, implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.5.while.end:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ successors: %bb.1(0x80000000)
+ liveins: $sgpr0_sgpr1, $vgpr0, $vgpr1, $vgpr2
+
+ %0:sgpr_64 = COPY $sgpr0_sgpr1
+ %1:vgpr_32 = COPY $vgpr0
+ %2:vgpr_32 = COPY $vgpr1
+ %3:vgpr_32 = COPY $vgpr2
+ S_BRANCH %bb.1
+
+ bb.1.while.cond:
+ successors: %bb.3(0x40000000), %bb.2(0x40000000)
+
+ %4:sreg_32 = COPY %0.sub1
+ %5:sreg_32 = S_MOV_B32 42
+ S_CMP_EQ_U32 %4, %5, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.3, implicit $scc
+ S_BRANCH %bb.2
+
+ bb.2.if.then:
+ successors: %bb.4(0x80000000)
+
+ %6:vreg_64 = V_ASHR_I64_e64 %0, %1, implicit $exec
+ %7:vgpr_32 = V_MUL_I32_I24_e32 4, %6.sub0, implicit $exec
+ %8:vgpr_32 = V_SUB_I32_e64 %6.sub1, %7, 0, implicit $exec
+ %9:vgpr_32 = V_MAX_I32_e32 %8, %2, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.3.if.else:
+ successors: %bb.4(0x80000000)
+
+ %10:vreg_64 = V_ASHR_I64_e64 %0, %1, implicit $exec
+ %11:vgpr_32 = V_MUL_I32_I24_e32 4, %10.sub0, implicit $exec
+ %12:vgpr_32 = V_SUB_I32_e64 %10.sub1, %11, 0, implicit $exec
+ %13:vgpr_32 = V_MIN_I32_e32 %12, %3, implicit $exec
+
+ bb.4.if.end:
+ successors: %bb.1(0x40000000), %bb.5(0x40000000)
+
+ %14:vgpr_32 = PHI %9, %bb.2, %13, %bb.3
+ %15:vreg_64 = COPY %0
+ FLAT_STORE_DWORD %15, %14, 0, 0, implicit $exec, implicit $flat_scr
+ %16:sreg_64 = V_CMP_EQ_U32_e64 %14, 2, implicit $exec
+ S_CMP_EQ_U32 %16.sub0, 10, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit $scc
+ S_BRANCH %bb.5
+
+ bb.5.while.end:
+ S_ENDPGM 0
+
+...
More information about the llvm-commits
mailing list