[llvm] [MachineCSE] Enhance MachineCSE simple PRE to find common subexpressi… (PR #129860)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 8 22:45:51 PST 2025
================
@@ -0,0 +1,142 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -run-pass=machine-cse -o - %s | FileCheck %s
+
+# The MachineCSE pass can't hoist the common subexpression completely in this test case, which may produce more
+# temporary variables used from CMBB in then-BB and else-BB, and increase register pressure.
+# If we enhance SimplePRE to try to hoist the subexpression completely and eliminate the redundant MIs in the
+# CSE process, this problem would be solved.
+
+# This test case has a while-loop containing one if-else; the purpose of the while-loop is to make
+# then-bb and else-bb potentially reachable from each other.
+# then-bb and else-bb have a common subexpression; %8 is defined in a predecessor block.
+#
+# %12:vgpr_32 = V_MUL_I32_I24_e32 1, %8, implicit $exec
+# %13:vgpr_32 = V_MUL_I32_I24_e32 4, %12, implicit $exec
+# %14:vgpr_32 = V_SUB_I32_e64 %12, %13, 0, implicit $exec
+#
+# Before this patch, Machine-CSE just hoists the first and second MIs, and the hoisted V_MUL_I32_I24_e32s may
+# also be followed by other MIs in CMBB, so from CMBB to then-bb or else-bb the live ranges of registers %12
+# and %13 become longer, which increases register pressure.
+
+
+--- |
+ define amdgpu_kernel void @not_cse_completely(ptr addrspace(1), i32 %a, i1) {
+ entry:
+ br label %while.cond
+ while.cond:
+ %cmp = trunc i32 %a to i1
+ br i1 %cmp, label %if.then, label %if.else
+ if.then:
+ br label %if.end
+ if.else:
+ br label %if.end
+ if.end:
+ %cmp2 = trunc i32 %a to i1
+ br i1 %cmp, label %while.cond, label %while.end
+ while.end:
+ ret void
+ }
+...
+---
+name: not_cse_completely
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: not_cse_completely
+ ; CHECK: bb.0.entry:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5, $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 0, 0
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 2, 0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY2]].sub0, 64, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, [[COPY2]].sub1, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY3]].sub0, 64, 0, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, [[COPY3]].sub1, 0, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1.while.cond:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 42
+ ; CHECK-NEXT: S_CMP_EQ_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[V_MUL_I32_I24_e32_:%[0-9]+]]:vgpr_32 = V_MUL_I32_I24_e32 [[V_ADD_U32_e64_]], [[V_ADD_U32_e64_1]], implicit $exec
+ ; CHECK-NEXT: [[V_MUL_I32_I24_e32_1:%[0-9]+]]:vgpr_32 = V_MUL_I32_I24_e32 [[V_MUL_I32_I24_e32_]], [[V_ADD_U32_e64_2]], implicit $exec
+ ; CHECK-NEXT: [[V_SUB_I32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 [[V_MUL_I32_I24_e32_]], [[V_MUL_I32_I24_e32_1]], 0, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.if.then:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_MAX_I32_e32_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e32 [[V_SUB_I32_e64_]], [[V_ADD_U32_e64_3]], implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3.if.else:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_MIN_I32_e32_:%[0-9]+]]:vgpr_32 = V_MIN_I32_e32 [[V_SUB_I32_e64_]], [[V_ADD_U32_e64_2]], implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4.if.end:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.5(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[V_MAX_I32_e32_]], %bb.2, [[V_MIN_I32_e32_]], %bb.3
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; CHECK-NEXT: FLAT_STORE_DWORD [[COPY5]], [[PHI]], 0, 0, implicit $exec, implicit $flat_scr
+ ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[PHI]], 2, implicit $exec
+ ; CHECK-NEXT: S_CMP_EQ_U32 [[V_CMP_EQ_U32_e64_]].sub0, 10, implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit $scc
+ ; CHECK-NEXT: S_BRANCH %bb.5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.5.while.end:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5, $vgpr0_vgpr1, $vgpr2_vgpr3
+ %0:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0(p4), 0, 0
+ %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0(p4), 2, 0
+ %3:sreg_64 = COPY %1
+ %6:vreg_64 = COPY $vgpr0_vgpr1
+ %7:vreg_64 = COPY $vgpr2_vgpr3
+ %8:vgpr_32 = V_ADD_U32_e64 %6.sub0, 64, 0, implicit $exec
+ %10:vgpr_32 = V_ADD_U32_e64 %7.sub0, 64, 0, implicit $exec
+ %11:vgpr_32 = V_ADD_U32_e64 64, %7.sub1, 0, implicit $exec
+ S_BRANCH %bb.1
----------------
emelliu wrote:
I update case with subreg using and compacting the reg number.
https://github.com/llvm/llvm-project/pull/129860
More information about the llvm-commits
mailing list