[llvm] [AMDGPU] Fix SDWA 'preserve' transformation for instructions in different basic blocks. (PR #82406)
Valery Pykhtin via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 22 09:15:26 PST 2024
https://github.com/vpykhtin updated https://github.com/llvm/llvm-project/pull/82406
From 95d056331ed0a0d655afaae124822fe4b0a05490 Mon Sep 17 00:00:00 2001
From: Valery Pykhtin <valery.pykhtin at gmail.com>
Date: Tue, 20 Feb 2024 17:43:42 +0100
Subject: [PATCH 1/3] [AMDGPU] Prevent SDWA preserve transformation for
instructions in different basic blocks.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 6 +-
llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir | 67 ++++++++++++++++++++++
2 files changed, 72 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 53fc2c0686245f..739ee17464edec 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -739,6 +739,11 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineInstr *SDWAInst = OrSDWADef->getParent();
MachineInstr *OtherInst = OrOtherDef->getParent();
+ // Instruction and operand sources should reside in the same BB.
+ if (SDWAInst->getParent() != MI.getParent() ||
+ OtherInst->getParent() != MI.getParent())
+ break;
+
// Check that OtherInstr is actually bitwise compatible with SDWAInst = their
// destination patterns don't overlap. Compatible instruction can be either
// regular instruction with compatible bitness or SDWA instruction with
@@ -815,7 +820,6 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
return std::make_unique<SDWADstPreserveOperand>(
OrDst, OrSDWADef, OrOtherDef, DstSel);
-
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index f93456ccacb806..359945ff799434 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -160,3 +160,70 @@ body: |
S_ENDPGM 0
...
+---
+name: add_f16_u32_preserve_different_bb
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vreg_64 }
+ - { id: 1, class: vreg_64 }
+ - { id: 2, class: sreg_64 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: vgpr_32 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+ - { id: 8, class: vgpr_32 }
+ - { id: 9, class: vgpr_32 }
+ - { id: 10, class: vgpr_32 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+body: |
+ ; SDWA-LABEL: name: add_f16_u32_preserve_different_bb
+ ; SDWA: bb.0:
+ ; SDWA-NEXT: successors: %bb.1(0x80000000)
+ ; SDWA-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
+ ; SDWA-NEXT: {{ $}}
+ ; SDWA-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+ ; SDWA-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+ ; SDWA-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; SDWA-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+ ; SDWA-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+ ; SDWA-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 65535, [[FLAT_LOAD_DWORD]], implicit $exec
+ ; SDWA-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec
+ ; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec
+ ; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec
+ ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 0, 4, 5, implicit $mode, implicit $exec
+ ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec
+ ; SDWA-NEXT: {{ $}}
+ ; SDWA-NEXT: bb.1:
+ ; SDWA-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F16_sdwa]], [[V_MUL_F32_sdwa]], implicit $exec
+ ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_OR_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+ ; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
+ ; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
+
+ %2 = COPY $sgpr30_sgpr31
+ %1 = COPY $vgpr2_vgpr3
+ %0 = COPY $vgpr0_vgpr1
+ %3 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+ %4 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+
+ %5 = V_AND_B32_e32 65535, %3, implicit $exec
+ %6 = V_LSHRREV_B32_e64 16, %4, implicit $exec
+ %7 = V_BFE_U32_e64 %3, 8, 8, implicit $exec
+ %8 = V_LSHRREV_B32_e32 24, %4, implicit $exec
+
+ %9 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec
+ %10 = V_LSHLREV_B16_e64 8, %9, implicit $exec
+ %11 = V_MUL_F32_e64 0, %7, 0, %8, 0, 0, implicit $mode, implicit $exec
+ %12 = V_LSHLREV_B32_e64 16, %11, implicit $exec
+
+ bb.1:
+ %13 = V_OR_B32_e64 %10, %12, implicit $exec
+
+ FLAT_STORE_DWORD %0, %13, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+ $sgpr30_sgpr31 = COPY %2
+ S_SETPC_B64_return $sgpr30_sgpr31
+...
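Patch 1 takes the conservative route: the 'preserve' match is simply rejected when either source of the V_OR_B32 is defined outside the OR's basic block. A minimal sketch of that kind of guard, assuming the usual MachineInstr API; the helper name definedInSameBlock is illustrative and is not code from the patch:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Illustrative helper: the 'preserve' match is only attempted when both
// defining instructions sit in the same basic block as the V_OR_B32.
static bool definedInSameBlock(const MachineInstr &Or,
                               const MachineInstr &SDWADef,
                               const MachineInstr &OtherDef) {
  const MachineBasicBlock *MBB = Or.getParent();
  return SDWADef.getParent() == MBB && OtherDef.getParent() == MBB;
}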
From ac500352e8dfe5c1d1b50225da6d9b3307de8015 Mon Sep 17 00:00:00 2001
From: Valery Pykhtin <valery.pykhtin at gmail.com>
Date: Thu, 22 Feb 2024 17:51:44 +0100
Subject: [PATCH 2/3] fix SDWADstPreserveOperand::convertToSDWA to support
'preserve' transform across different basic blocks
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 12 +++---------
llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir | 15 ++++++++++-----
2 files changed, 13 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 739ee17464edec..8d9053a9df1c00 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -472,12 +472,11 @@ bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
}
// Move MI before v_or_b32
- auto MBB = MI.getParent();
- MBB->remove(&MI);
- MBB->insert(getParentInst(), &MI);
+ MI.getParent()->remove(&MI);
+ getParentInst()->getParent()->insert(getParentInst(), &MI);
// Add Implicit use of preserved register
- MachineInstrBuilder MIB(*MBB->getParent(), MI);
+ MachineInstrBuilder MIB(*MI.getMF(), MI);
MIB.addReg(getPreservedOperand()->getReg(),
RegState::ImplicitKill,
getPreservedOperand()->getSubReg());
@@ -739,11 +738,6 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineInstr *SDWAInst = OrSDWADef->getParent();
MachineInstr *OtherInst = OrOtherDef->getParent();
- // Instruction and operand sources should reside in the same BB.
- if (SDWAInst->getParent() != MI.getParent() ||
- OtherInst->getParent() != MI.getParent())
- break;
-
// Check that OtherInstr is actually bitwise compatible with SDWAInst = their
// destination patterns don't overlap. Compatible instruction can be either
// regular instruction with compatible bitness or SDWA instruction with
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index 359945ff799434..7746afc3e6b264 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -193,12 +193,15 @@ body: |
; SDWA-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec
; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec
; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec
- ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 0, 4, 5, implicit $mode, implicit $exec
- ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec
; SDWA-NEXT: {{ $}}
; SDWA-NEXT: bb.1:
- ; SDWA-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F16_sdwa]], [[V_MUL_F32_sdwa]], implicit $exec
- ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_OR_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+ ; SDWA-NEXT: successors: %bb.2(0x80000000)
+ ; SDWA-NEXT: {{ $}}
+ ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec
+ ; SDWA-NEXT: {{ $}}
+ ; SDWA-NEXT: bb.2:
+ ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F32_sdwa]](tied-def 0)
+ ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
bb.0:
@@ -217,10 +220,12 @@ body: |
%9 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec
%10 = V_LSHLREV_B16_e64 8, %9, implicit $exec
+
+ bb.1:
%11 = V_MUL_F32_e64 0, %7, 0, %8, 0, 0, implicit $mode, implicit $exec
%12 = V_LSHLREV_B32_e64 16, %11, implicit $exec
- bb.1:
+ bb.2:
%13 = V_OR_B32_e64 %10, %12, implicit $exec
FLAT_STORE_DWORD %0, %13, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
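Patch 2 drops that restriction and instead fixes SDWADstPreserveOperand::convertToSDWA: the converted instruction is re-inserted into the basic block that contains the V_OR_B32 rather than being assumed to share a block with it. A rough sketch of the reinsertion, assuming the MachineInstr/MachineInstrBuilder API; moveBeforeOr and its parameters are illustrative names, not code taken from the patch:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

using namespace llvm;

// Illustrative sketch: move MI so it sits immediately before the V_OR_B32,
// using the OR's own parent block (the two blocks may differ), then mark the
// preserved register as an implicit kill on the moved instruction.
static void moveBeforeOr(MachineInstr &MI, MachineInstr &OrInst,
                         const MachineOperand &Preserved) {
  MI.getParent()->remove(&MI);
  OrInst.getParent()->insert(&OrInst, &MI);

  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(Preserved.getReg(), RegState::ImplicitKill,
             Preserved.getSubReg());
}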
From b196f5647e0bda116ecddcd73c40000a9478df53 Mon Sep 17 00:00:00 2001
From: Valery Pykhtin <valery.pykhtin at gmail.com>
Date: Thu, 22 Feb 2024 18:15:17 +0100
Subject: [PATCH 3/3] remove registers section, restore unrelated space change.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 1 +
llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir | 43 +++++++---------------
2 files changed, 15 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 8d9053a9df1c00..afc380b4203457 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -814,6 +814,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
return std::make_unique<SDWADstPreserveOperand>(
OrDst, OrSDWADef, OrOtherDef, DstSel);
+
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index 7746afc3e6b264..4c61e6803febf3 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -163,21 +163,6 @@ body: |
---
name: add_f16_u32_preserve_different_bb
tracksRegLiveness: true
-registers:
- - { id: 0, class: vreg_64 }
- - { id: 1, class: vreg_64 }
- - { id: 2, class: sreg_64 }
- - { id: 3, class: vgpr_32 }
- - { id: 4, class: vgpr_32 }
- - { id: 5, class: vgpr_32 }
- - { id: 6, class: vgpr_32 }
- - { id: 7, class: vgpr_32 }
- - { id: 8, class: vgpr_32 }
- - { id: 9, class: vgpr_32 }
- - { id: 10, class: vgpr_32 }
- - { id: 11, class: vgpr_32 }
- - { id: 12, class: vgpr_32 }
- - { id: 13, class: vgpr_32 }
body: |
; SDWA-LABEL: name: add_f16_u32_preserve_different_bb
; SDWA: bb.0:
@@ -207,26 +192,26 @@ body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
- %2 = COPY $sgpr30_sgpr31
- %1 = COPY $vgpr2_vgpr3
- %0 = COPY $vgpr0_vgpr1
- %3 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
- %4 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+ %2:sreg_64 = COPY $sgpr30_sgpr31
+ %1:vreg_64 = COPY $vgpr2_vgpr3
+ %0:vreg_64 = COPY $vgpr0_vgpr1
+ %3:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+ %4:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
- %5 = V_AND_B32_e32 65535, %3, implicit $exec
- %6 = V_LSHRREV_B32_e64 16, %4, implicit $exec
- %7 = V_BFE_U32_e64 %3, 8, 8, implicit $exec
- %8 = V_LSHRREV_B32_e32 24, %4, implicit $exec
+ %5:vgpr_32 = V_AND_B32_e32 65535, %3, implicit $exec
+ %6:vgpr_32 = V_LSHRREV_B32_e64 16, %4, implicit $exec
+ %7:vgpr_32 = V_BFE_U32_e64 %3, 8, 8, implicit $exec
+ %8:vgpr_32 = V_LSHRREV_B32_e32 24, %4, implicit $exec
- %9 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec
- %10 = V_LSHLREV_B16_e64 8, %9, implicit $exec
+ %9:vgpr_32 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec
+ %10:vgpr_32 = V_LSHLREV_B16_e64 8, %9, implicit $exec
bb.1:
- %11 = V_MUL_F32_e64 0, %7, 0, %8, 0, 0, implicit $mode, implicit $exec
- %12 = V_LSHLREV_B32_e64 16, %11, implicit $exec
+ %11:vgpr_32 = V_MUL_F32_e64 0, %7, 0, %8, 0, 0, implicit $mode, implicit $exec
+ %12:vgpr_32 = V_LSHLREV_B32_e64 16, %11, implicit $exec
bb.2:
- %13 = V_OR_B32_e64 %10, %12, implicit $exec
+ %13:vgpr_32 = V_OR_B32_e64 %10, %12, implicit $exec
FLAT_STORE_DWORD %0, %13, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
$sgpr30_sgpr31 = COPY %2