[llvm] [MachineCSE] Enhance MachineCSE simple PRE to find common subexpressi… (PR #129860)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 5 02:10:28 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: None (emelliu)
<details>
<summary>Changes</summary>
…on completely (#<!-- -->129516).
When an IF-THEN-ELSE construct appears inside a loop, the THEN and ELSE basic blocks may share a common subexpression consisting of multiple MIs, but the current PRE process can only find and hoist one MI at a time. This can create many temporary registers whose live ranges extend from CMBB to the THEN/ELSE blocks, which may increase register pressure.
This patch tries to find all MIs of the common subexpression and hoist them to CMBB during the PRE process.
---
Full diff: https://github.com/llvm/llvm-project/pull/129860.diff
2 Files Affected:
- (modified) llvm/lib/CodeGen/MachineCSE.cpp (+17)
- (added) llvm/test/CodeGen/AMDGPU/not-cse-completely.mir (+126)
``````````diff
diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp
index 6d14509c5934f..9cdbcbafb6201 100644
--- a/llvm/lib/CodeGen/MachineCSE.cpp
+++ b/llvm/lib/CodeGen/MachineCSE.cpp
@@ -887,6 +887,23 @@ bool MachineCSEImpl::ProcessBlockPRE(MachineDominatorTree *DT,
NewMI.setDebugLoc(EmptyDL);
NewMI.getOperand(0).setReg(NewReg);
+ for (MachineOperand &MO :
+ llvm::make_early_inc_range(MRI->use_nodbg_operands(VReg))) {
+ if (MO.isUse()) {
+ MO.setReg(NewReg);
+ }
+ }
+ auto *SiblingBBMI = PREMap.try_emplace(&MI).first->getFirst();
+ for (MachineOperand &MO :
+ llvm::make_early_inc_range(MRI->use_nodbg_operands(
+ SiblingBBMI->getOperand(0).getReg()))) {
+ if (MO.isUse()) {
+ MachineInstr *UseMI = MO.getParent();
+ PREMap.erase(UseMI);
+ MO.setReg(NewReg);
+ PREMap[UseMI] = UseMI->getParent();
+ }
+ }
PREMap[&MI] = CMBB;
++NumPREs;
diff --git a/llvm/test/CodeGen/AMDGPU/not-cse-completely.mir b/llvm/test/CodeGen/AMDGPU/not-cse-completely.mir
new file mode 100644
index 0000000000000..c503fff062332
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/not-cse-completely.mir
@@ -0,0 +1,126 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -run-pass=machine-cse -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -passes=machine-cse -o - %s | FileCheck %s
+
+--- |
+ define amdgpu_kernel void @not_cse_completely(ptr addrspace(1), i32 %a, i1) {
+ entry:
+ br label %while.cond
+ while.cond:
+ %cmp = trunc i32 %a to i1
+ br i1 %cmp, label %if.then, label %if.else
+ if.then:
+ br label %if.end
+ if.else:
+ br label %if.end
+ if.end:
+ %cmp2 = trunc i32 %a to i1
+ br i1 %cmp, label %while.cond, label %while.end
+ while.end:
+ ret void
+ }
+...
+---
+name: not_cse_completely
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: not_cse_completely
+ ; CHECK: bb.0.entry:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 0, 0
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 2, 0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY2]].sub0, 64, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, [[COPY2]].sub1, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY3]].sub0, 64, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, [[COPY3]].sub1, 0, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1.while.cond:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 42
+ ; CHECK-NEXT: S_CMP_EQ_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[V_MUL_I32_I24_e32_:%[0-9]+]]:vgpr_32 = V_MUL_I32_I24_e32 [[V_ADD_U32_e64_]], [[V_ADD_U32_e64_1]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_I32_I24_e32_1:%[0-9]+]]:vgpr_32 = V_MUL_I32_I24_e32 [[V_MUL_I32_I24_e32_]], [[V_ADD_U32_e64_2]], implicit $exec
+ ; CHECK-NEXT: [[V_SUB_I32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 [[V_MUL_I32_I24_e32_]], [[V_MUL_I32_I24_e32_1]], 0, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.if.then:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_MAX_I32_e32_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e32 [[V_SUB_I32_e64_]], [[V_ADD_U32_e64_3]], implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3.if.else:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_MIN_I32_e32_:%[0-9]+]]:vgpr_32 = V_MIN_I32_e32 [[V_SUB_I32_e64_]], [[V_ADD_U32_e64_2]], implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4.if.end:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MAX_I32_e32_]], %bb.2, [[V_MIN_I32_e32_]], %bb.3
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY5]], [[PHI]], 0, 0, implicit $exec, implicit $flat_scr
+ ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[PHI]], 2, implicit $exec
+ ; CHECK-NEXT: S_CMP_EQ_U32 [[V_CMP_EQ_U32_e64_]].sub0, 10, implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.5.while.end:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5, $vgpr0_vgpr1, $vgpr2_vgpr3
+ %0:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0(p4), 0, 0
+ %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0(p4), 2, 0
+ %3:sreg_64 = COPY %1
+ %6:vreg_64 = COPY $vgpr0_vgpr1
+ %7:vreg_64 = COPY $vgpr2_vgpr3
+ %8:vgpr_32 = V_ADD_U32_e64 %6.sub0, 64, 0, implicit $exec
+ %9:vgpr_32 = V_ADD_U32_e64 64, %6.sub1, 0, implicit $exec
+ %10:vgpr_32 = V_ADD_U32_e64 %7.sub0, 64, 0, implicit $exec
+ %11:vgpr_32 = V_ADD_U32_e64 64, %7.sub1, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1.while.cond:
+ %4:sreg_32 = COPY %2.sub1
+ %5:sreg_32 = S_MOV_B32 42
+ S_CMP_EQ_U32 %4, %5, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.3, implicit $scc
+ S_BRANCH %bb.2
+
+ bb.2.if.then:
+ %12:vgpr_32 = V_MUL_I32_I24_e32 %8, %9, implicit $exec
+ %13:vgpr_32 = V_MUL_I32_I24_e32 %12, %10, implicit $exec
+ %14:vgpr_32 = V_SUB_I32_e64 %12, %13, 0, implicit $exec
+ %15:vgpr_32 = V_MAX_I32_e32 %14, %11, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.3.if.else:
+ %16:vgpr_32 = V_MUL_I32_I24_e32 %8, %9, implicit $exec
+ %17:vgpr_32 = V_MUL_I32_I24_e32 %16, %10, implicit $exec
+ %18:vgpr_32 = V_SUB_I32_e64 %16, %17, 0, implicit $exec
+ %19:vgpr_32 = V_MIN_I32_e32 %18, %10, implicit $exec
+
+ bb.4.if.end:
+ %20:vgpr_32 = PHI %15, %bb.2, %19, %bb.3
+ %21:vreg_64 = COPY %1
+ FLAT_STORE_DWORD %21, %20, 0, 0, implicit $exec, implicit $flat_scr
+ %22:sreg_64 = V_CMP_EQ_U32_e64 %20, 2, implicit $exec
+ S_CMP_EQ_U32 %22.sub0, 10, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit $scc
+ S_BRANCH %bb.5
+
+ bb.5.while.end:
+ S_ENDPGM 0
+
+...
``````````
</details>
https://github.com/llvm/llvm-project/pull/129860
More information about the llvm-commits
mailing list