[llvm] MachineLICM: Allow hoisting REG_SEQUENCE (PR #90638)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 30 10:59:33 PDT 2024
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/90638
This is just a fancy copy. Extend the copy handling to cover
reg_sequence with only virtual register uses. This avoids some
test regressions in a future commit.
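For illustration, here is a condensed before/after sketch of the intended
transform, trimmed from the licm_reg_sequence MIR test added in this patch
(block structure and register numbers are simplified):

  ; before: the REG_SEQUENCE is re-executed on every iteration of the loop
  bb.1:
    %3:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
    S_NOP 0, implicit %3
    S_CBRANCH_SCC1 %bb.1, implicit undef $scc

  ; after: both inputs are loop-invariant virtual registers, so the
  ; REG_SEQUENCE is hoisted into the preheader just like a plain COPY
  bb.0:
    %3:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
  bb.1:
    S_NOP 0, implicit %3
    S_CBRANCH_SCC1 %bb.1, implicit undef $scc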
From a34f1fadf5c7dce5e226bc290bae9323bf5f3860 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 30 Apr 2024 18:56:48 +0200
Subject: [PATCH 1/2] AMDGPU: Add baseline test for machinelicm handling
---
.../AMDGPU/machinelicm-copy-like-instrs.mir | 134 ++++++++++++++++++
1 file changed, 134 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
diff --git a/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir b/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
new file mode 100644
index 00000000000000..9b346b73ffbc88
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
@@ -0,0 +1,134 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn -run-pass=early-machinelicm -simplify-mir -o - %s | FileCheck %s
+
+# Test machine LICM handling of copy-like instructions (REG_SEQUENCE,
+# INSERT_SUBREG) whose inputs are loop invariant.
+
+---
+name: licm_reg_sequence
+body: |
+ ; CHECK-LABEL: name: licm_reg_sequence
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[REG_SEQUENCE]]
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ successors: %bb.1
+
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ %3:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
+ S_NOP 0, implicit %3
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ $vgpr0 = COPY %3
+ S_ENDPGM 0
+
+...
+
+# Don't bother handling reg_sequence with physreg uses (is there any
+# reason for these to be legal?).
+---
+name: licm_reg_sequence_physreg_use
+body: |
+ ; CHECK-LABEL: name: licm_reg_sequence_physreg_use
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, $vgpr1, %subreg.sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[REG_SEQUENCE]]
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ successors: %bb.1
+
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ liveins: $vgpr0
+
+ %3:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, $vgpr1, %subreg.sub1
+ S_NOP 0, implicit %3
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ $vgpr0 = COPY %3
+ S_ENDPGM 0
+
+...
+
+---
+name: licm_insert_subreg
+body: |
+ ; CHECK-LABEL: name: licm_insert_subreg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.sub0
+ ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[INSERT_SUBREG]], [[COPY1]], %subreg.sub1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: S_NOP 0, implicit [[INSERT_SUBREG1]]
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[INSERT_SUBREG1]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ successors: %bb.1
+
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ %3:vreg_64 = IMPLICIT_DEF
+ %4:vreg_64 = INSERT_SUBREG %3, %0, %subreg.sub0
+ %5:vreg_64 = INSERT_SUBREG %4, %1, %subreg.sub1
+ S_NOP 0, implicit %5
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ $vgpr0_vgpr1 = COPY %5
+ S_ENDPGM 0
+
+...
From b3e1f07ed056854d60f33c69e9f358dbee6515b9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 30 Apr 2024 17:37:24 +0200
Subject: [PATCH 2/2] MachineLICM: Allow hoisting REG_SEQUENCE
This is just a fancy copy. Extend the copy handling to cover
reg_sequence with only virtual register uses. This avoids some
test regressions in a future commit.
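By contrast, a REG_SEQUENCE with a physical register input is deliberately
left alone; the licm_reg_sequence_physreg_use test added by the first patch
keeps it in the loop. A condensed sketch of that unhoisted case:

  bb.1:
    ; $vgpr1 is a physreg use, so the new check rejects the instruction
    ; and it stays in the loop body
    %3:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, $vgpr1, %subreg.sub1
    S_NOP 0, implicit %3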
---
llvm/lib/CodeGen/MachineLICM.cpp | 45 +++++++++++--------
.../AMDGPU/global_atomics_i64_system.ll | 10 ++---
.../AMDGPU/machinelicm-copy-like-instrs.mir | 2 +-
.../CodeGen/AMDGPU/optimize-negated-cond.ll | 4 +-
.../CodeGen/Hexagon/expand-vstorerw-undef.ll | 1 +
5 files changed, 35 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d569a082cebe06..727a98c41bce4c 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1264,25 +1264,32 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI,
// If we have a COPY with other uses in the loop, hoist to allow the users to
// also be hoisted.
- Register DefReg;
- if (MI.isCopy() && (DefReg = MI.getOperand(0).getReg()).isVirtual() &&
- MI.getOperand(1).getReg().isVirtual() &&
- IsLoopInvariantInst(MI, CurLoop) &&
- any_of(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
- [&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) {
- if (!CurLoop->contains(&UseMI))
- return false;
-
- // COPY is a cheap instruction, but if moving it won't cause high
- // RP we're fine to hoist it even if the user can't be hoisted
- // later Otherwise we want to check the user if it's hoistable
- if (CanCauseHighRegPressure(Cost, false) &&
- !CurLoop->isLoopInvariant(UseMI, DefReg))
- return false;
-
- return true;
- }))
- return true;
+ // TODO: Handle all isCopyLike?
+ if (MI.isCopy() || MI.isRegSequence()) {
+ Register DefReg = MI.getOperand(0).getReg();
+ if (DefReg.isVirtual() &&
+ all_of(MI.uses(),
+ [](const MachineOperand &UseOp) {
+ return !UseOp.isReg() || UseOp.getReg().isVirtual();
+ }) &&
+ IsLoopInvariantInst(MI, CurLoop) &&
+ any_of(MRI->use_nodbg_instructions(DefReg),
+ [&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) {
+ if (!CurLoop->contains(&UseMI))
+ return false;
+
+ // COPY is a cheap instruction, but if moving it won't cause
+ // high RP we're fine to hoist it even if the user can't be
+ // hoisted later. Otherwise we want to check the user if it's
+ // hoistable.
+ if (CanCauseHighRegPressure(Cost, false) &&
+ !CurLoop->isLoopInvariant(UseMI, DefReg))
+ return false;
+
+ return true;
+ }))
+ return true;
+ }
// High register pressure situation, only hoist if the instruction is going
// to be remat'ed.
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index f5c2bd6286cb8e..41a883302e8f70 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -8907,17 +8907,17 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_mov_b32_e32 v4, s3
; SI-NEXT: v_mov_b32_e32 v5, s2
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s10
-; SI-NEXT: v_mov_b32_e32 v3, s11
+; SI-NEXT: v_mov_b32_e32 v2, s4
+; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: .LBB127_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir b/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
index 9b346b73ffbc88..e9945f005d2645 100644
--- a/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
+++ b/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
@@ -13,9 +13,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; CHECK-NEXT: S_NOP 0, implicit [[REG_SEQUENCE]]
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
; CHECK-NEXT: S_BRANCH %bb.2
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index 7c351d2b8443b1..a50a0766f67c2c 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -8,9 +8,10 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s10, -1
; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s9, s5
; GCN-NEXT: s_mov_b32 s8, s4
+; GCN-NEXT: s_mov_b32 s9, s5
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_branch .LBB0_2
; GCN-NEXT: .LBB0_1: ; %loop.exit.guard
@@ -20,7 +21,6 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
; GCN-NEXT: .LBB0_2: ; %bb1
; GCN-NEXT: ; =>This Loop Header: Depth=1
; GCN-NEXT: ; Child Loop BB0_4 Depth 2
-; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: buffer_load_dword v1, off, s[8:11], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v1
diff --git a/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll b/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
index 867ce3b930f8fc..69ba266227265c 100644
--- a/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
+++ b/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
@@ -69,6 +69,7 @@ b18: ; preds = %b16, %b7
br label %b22
b21: ; preds = %b22
+ store volatile <64 x i32> %v20, ptr null
tail call void @sammy() #3
br label %b7