[llvm] [AMDGPU] Support wide register or subregister access when emitting s_singleuse_vdst instructions. (PR #88520)
Scott Egerton via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 18 02:56:19 PDT 2024
https://github.com/ScottEgerton updated https://github.com/llvm/llvm-project/pull/88520
>From 42ac27015082ab16d55019a0c7864a1cb7a669b5 Mon Sep 17 00:00:00 2001
From: Scott Egerton <scott.egerton at amd.com>
Date: Mon, 15 Jan 2024 16:59:35 +0000
Subject: [PATCH 1/4] [AMDGPU] Support wide register or subregister access when
emitting s_singleuse_vdst instructions.
Both single-use producer and consumer instructions that use wide registers
or subregisters are now tracked correctly and are eligible to be marked as
single use.
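
For reference, here is a minimal standalone sketch of the per-register-unit
counting idea (the helper names and signatures are illustrative only and not
part of the patch; the pass itself does this inline over machine operands):

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

using namespace llvm;

// Record one read of Reg by bumping the count of every register unit it
// covers. A read of $vgpr0 and a write of $vgpr0_lo16 share a register
// unit, which is what makes wide and sub-register accesses interact
// correctly.
static void countRead(const TargetRegisterInfo *TRI, MCPhysReg Reg,
                      DenseMap<MCRegUnit, unsigned> &RegisterUseCount) {
  for (const MCRegUnit &Unit : TRI->regunits(Reg))
    ++RegisterUseCount[Unit];
}

// A def is a single-use candidate only if no unit it covers has been read
// more than once below it.
static bool isSingleUseDef(const TargetRegisterInfo *TRI, MCPhysReg Reg,
                           const DenseMap<MCRegUnit, unsigned> &RegisterUseCount) {
  return none_of(TRI->regunits(Reg), [&](MCRegUnit Unit) {
    const auto It = RegisterUseCount.find(Unit);
    return It != RegisterUseCount.end() && It->second > 1;
  });
}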
---
.../AMDGPU/AMDGPUInsertSingleUseVDST.cpp | 21 +++-
.../CodeGen/AMDGPU/insert-singleuse-vdst.mir | 109 +++++++++++++++++-
2 files changed, 119 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
index 93ed77bb6f7efe..dcae8c60429ef9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -64,13 +64,15 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
bool InstructionEmitted = false;
for (MachineBasicBlock &MBB : MF) {
- DenseMap<MCPhysReg, unsigned> RegisterUseCount; // TODO: MCRegUnits
+ DenseMap<MCRegUnit, unsigned> RegisterUseCount;
// Handle boundaries at the end of basic block separately to avoid
// false positives. If they are live at the end of a basic block then
// assume it has more uses later on.
- for (const auto &Liveouts : MBB.liveouts())
- RegisterUseCount[Liveouts.PhysReg] = 2;
+ for (const auto &Liveout : MBB.liveouts()) {
+ for (const MCRegUnit &Unit : TRI->regunits(Liveout.PhysReg))
+ RegisterUseCount[Unit] = 2;
+ }
for (MachineInstr &MI : reverse(MBB.instrs())) {
// All registers in all operands need to be single use for an
@@ -84,7 +86,8 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
// Count the number of times each register is read.
if (Operand.readsReg())
- RegisterUseCount[Reg]++;
+ for (const MCRegUnit &Unit : TRI->regunits(Reg))
+ RegisterUseCount[Unit]++;
// Do not attempt to optimise across exec mask changes.
if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
@@ -96,10 +99,16 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
// check if the operands are single use.
if (!MI.modifiesRegister(Reg, TRI))
continue;
- if (RegisterUseCount[Reg] > 1)
+
+ const auto RegUnits = TRI->regunits(Reg);
+ if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit &Unit) {
+ return RegisterUseCount[Unit] > 1;
+ }))
AllProducerOperandsAreSingleUse = false;
+
// Reset uses count when a register is no longer live.
- RegisterUseCount.erase(Reg);
+ for (const MCRegUnit &Unit : RegUnits)
+ RegisterUseCount[Unit] = 0;
}
if (AllProducerOperandsAreSingleUse && SIInstrInfo::isVALU(MI)) {
// TODO: Replace with candidate logging for instruction grouping
diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
index 833699b4656b64..065226e0a42f0d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
@@ -398,7 +398,6 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: $exec = COPY $sgpr0_sgpr1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
@@ -424,7 +423,6 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: $exec_lo = COPY $sgpr0
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
@@ -450,7 +448,6 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: $exec_hi = COPY $sgpr0
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
@@ -521,9 +518,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec
- ; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
@@ -582,6 +577,32 @@ body: |
liveins: $vgpr1
...
+# Write low 16-bits and then read 32-bit vgpr twice.
+---
+name: write_lo_read_full_twice
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: write_lo_read_full_twice
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ bb.0:
+ liveins: $vgpr0
+ $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ bb.1:
+ liveins: $vgpr1, $vgpr2
+...
+
# Write high 16-bits and then read 32-bit vgpr.
---
name: write_hi_read_full
@@ -605,3 +626,81 @@ body: |
bb.1:
liveins: $vgpr1
...
+
+# Write high 16-bits and then read 32-bit vgpr twice.
+---
+name: write_hi_read_full_twice
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: write_hi_read_full_twice
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ bb.0:
+ liveins: $vgpr0
+ $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ bb.1:
+ liveins: $vgpr1, $vgpr2
+...
+
+# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr.
+---
+name: write_both_read_full
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: write_both_read_full
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_SINGLEUSE_VDST 1
+ ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
+ ; CHECK-NEXT: S_SINGLEUSE_VDST 1
+ ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ bb.0:
+ $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
+ $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ bb.1:
+ liveins: $vgpr1
+...
+
+# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr twice.
+---
+name: write_both_read_full_twice
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: write_both_read_full_twice
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
+ ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
+ ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ bb.0:
+ $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
+ $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+ bb.1:
+ liveins: $vgpr1, $vgpr2
+...
>From 5436e939dffddcd92296b37f58903dde5be80221 Mon Sep 17 00:00:00 2001
From: Scott Egerton <scott.egerton at amd.com>
Date: Tue, 16 Apr 2024 16:14:49 +0100
Subject: [PATCH 2/4] fixup! [AMDGPU] Support wide register or subregister
access when emitting s_singleuse_vdst instructions.
Change-Id: If41aa496d852d0ac1fddfc75f19acbfa20d2309e
---
llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir | 4 ----
1 file changed, 4 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
index 065226e0a42f0d..f12a8e95593fe2 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
@@ -593,7 +593,6 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: liveins: $vgpr1, $vgpr2
- ; CHECK-NEXT: {{ $}}
bb.0:
liveins: $vgpr0
$vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
@@ -643,7 +642,6 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: liveins: $vgpr1, $vgpr2
- ; CHECK-NEXT: {{ $}}
bb.0:
liveins: $vgpr0
$vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
@@ -670,7 +668,6 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: liveins: $vgpr1
- ; CHECK-NEXT: {{ $}}
bb.0:
$vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
$vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
@@ -695,7 +692,6 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: liveins: $vgpr1, $vgpr2
- ; CHECK-NEXT: {{ $}}
bb.0:
$vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec
$vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec
>From d0c8bec6e24a6da808677aa2a59d027262b1d33a Mon Sep 17 00:00:00 2001
From: Scott Egerton <scott.egerton at amd.com>
Date: Wed, 17 Apr 2024 11:45:23 +0100
Subject: [PATCH 3/4] Separate out exec mask fix
Changing from erasing unused registers to resetting
their counts to 0 fixes an issue where single-use
values were being incorrectly detected across exec
mask changes. However, this is not relevant to
wide/sub-register changes, so it will be included
in a separate patch.
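
For clarity, the two reset strategies in question differ only in whether the
map entry is kept; a minimal sketch, with RegUnits and RegisterUseCount as in
the pass:

// This patch reverts to erasing, which drops the unit from the map
// entirely once its defining instruction has been processed:
for (const MCRegUnit &Unit : RegUnits)
  RegisterUseCount.erase(Unit);

// The deferred exec-mask fix instead keeps the entry with a zero count,
// so the unit stays visible to any later bookkeeping that walks the
// map's existing entries:
for (const MCRegUnit &Unit : RegUnits)
  RegisterUseCount[Unit] = 0;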
---
llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp | 2 +-
llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir | 3 +++
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
index dcae8c60429ef9..7a383528527a22 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -108,7 +108,7 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
// Reset uses count when a register is no longer live.
for (const MCRegUnit &Unit : RegUnits)
- RegisterUseCount[Unit] = 0;
+ RegisterUseCount.erase(Unit);
}
if (AllProducerOperandsAreSingleUse && SIInstrInfo::isVALU(MI)) {
// TODO: Replace with candidate logging for instruction grouping
diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
index f12a8e95593fe2..135a101822bfa6 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir
@@ -398,6 +398,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: $exec = COPY $sgpr0_sgpr1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
@@ -423,6 +424,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr0
; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: $exec_lo = COPY $sgpr0
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
@@ -448,6 +450,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr0
; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_SINGLEUSE_VDST 1
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: $exec_hi = COPY $sgpr0
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec
>From af6aae86924c66c66088b16593a9786c09d9dd7e Mon Sep 17 00:00:00 2001
From: Scott Egerton <scott.egerton at amd.com>
Date: Thu, 18 Apr 2024 10:25:11 +0100
Subject: [PATCH 4/4] Make use of MCRegUnitMaskIterator
This allows checking that live-out register units are
actually covered by Liveout.LaneMask before excluding them from
single-use optimisations.
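
For reference, a consolidated sketch of the lane-mask filtering that the hunk
below introduces (the helper wrapper is illustrative and not part of the
patch):

#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

using namespace llvm;

// Mark only the register units that the live-out lane mask actually covers
// as having further uses; units outside the mask (e.g. the high half when
// only $vgpr0_lo16 is live out) remain eligible for the optimisation.
static void markLiveOutUnits(const TargetRegisterInfo *TRI,
                             const MachineBasicBlock::RegisterMaskPair &Liveout,
                             DenseMap<MCRegUnit, unsigned> &RegisterUseCount) {
  for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid();
       ++Units) {
    const auto [Unit, Mask] = *Units;
    if ((Mask & Liveout.LaneMask).any())
      RegisterUseCount[Unit] = 2;
  }
}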
---
llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
index 7a383528527a22..f6a63502ec2cda 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -70,8 +70,12 @@ class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
// false positives. If they are live at the end of a basic block then
// assume it has more uses later on.
for (const auto &Liveout : MBB.liveouts()) {
- for (const MCRegUnit &Unit : TRI->regunits(Liveout.PhysReg))
- RegisterUseCount[Unit] = 2;
+ for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid();
+ ++Units) {
+ const auto [Unit, Mask] = *Units;
+ if ((Mask & Liveout.LaneMask).any())
+ RegisterUseCount[Unit] = 2;
+ }
}
for (MachineInstr &MI : reverse(MBB.instrs())) {